# Libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re

from os.path import exists
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack

sns.set(style='whitegrid')

# Loading the Files

The csv files are expected in the `../input/dataset/` directory, which is where the cell below reads `train.csv` and `test.csv` from.

In [None]:
# Load the two input CSVs (train first, then test).
trainingSet, testingSet = (
    pd.read_csv(csv_path)
    for csv_path in ("../input/dataset/train.csv", "../input/dataset/test.csv")
)

# Optional sanity checks to run by hand while exploring the data:
#   trainingSet.shape, testingSet.shape
#   trainingSet.head(), testingSet.head()
#   trainingSet.describe()
#   trainingSet['Score'].value_counts().plot(kind='bar', legend=True, alpha=.5); plt.show()

# Sentiment Score Calculation
To capture the sentiment of each review, I created a simple sentiment score. The score is based on counting occurrences of predefined positive and negative words in the review text (the 10 words in each category are simply the ones I saw most frequently). For each review, the number of negative words is subtracted from the number of positive words, giving a numerical value that indicates whether the review leans positive or negative.

In [None]:
# The lexicons and the tokenizer are built once at import time so that
# sentiment_score stays cheap when applied row by row to a large DataFrame.
_POSITIVE_WORDS = frozenset([
    'good', 'great', 'excellent', 'amazing', 'love',
    'liked', 'wonderful', 'best', 'fantastic', 'enjoyed',
])
_NEGATIVE_WORDS = frozenset([
    'bad', 'terrible', 'awful', 'hate', 'disliked',
    'poor', 'worst', 'boring', 'disappointing', 'waste',
])
_WORD_PATTERN = re.compile(r'[a-z]+')


def sentiment_score(text):
    """Return (# positive words) - (# negative words) found in *text*.

    The input is coerced with ``str()`` and lower-cased, so non-string values
    (e.g. ``None`` or NaN) are accepted and score 0.

    Only whole words are counted. Plain substring counting would score
    "whatever" as negative (it contains "hate") and "glove" as positive
    (it contains "love"), which is not what a word count should do.

    Args:
        text: The review text (any object; converted with ``str``).

    Returns:
        int: Positive-word count minus negative-word count.
    """
    tokens = _WORD_PATTERN.findall(str(text).lower())
    pos_count = sum(token in _POSITIVE_WORDS for token in tokens)
    neg_count = sum(token in _NEGATIVE_WORDS for token in tokens)
    return pos_count - neg_count

# Text Preprocessing
Before performing feature extraction, I applied basic preprocessing to clean up the review text.
This helps eliminate irrelevant characters and standardizes the text, improving the effectiveness of TF-IDF feature extraction.

In [None]:
# Patterns are compiled once; preprocess_text is applied to every review.
_HTML_TAG_RE = re.compile(r'<[^>]+>')
_URL_RE = re.compile(r'http\S+')
_APOSTROPHE_RE = re.compile(r"['\u2019]")
_NON_LETTER_RE = re.compile(r'[^a-z\s]')
_WHITESPACE_RE = re.compile(r'\s+')


def preprocess_text(text):
    """Normalize review text to lower-case ASCII letters separated by single spaces.

    Steps, in order:
      1. Coerce to ``str`` and lower-case.
      2. Replace HTML tags with a space. This runs before URL removal so that
         a URL inside a tag attribute cannot swallow the following anchor
         text and closing tag.
      3. Replace URLs (anything starting with ``http``) with a space.
      4. Drop apostrophes so contractions stay one token ("don't" -> "dont").
      5. Replace every other non-letter with a space. Using a space rather
         than the empty string keeps neighbouring words apart
         ("great<br />movie" -> "great movie", not "greatmovie").
      6. Collapse runs of whitespace and strip the ends.

    Args:
        text: The raw text (any object; converted with ``str``).

    Returns:
        str: The cleaned text.
    """
    text = str(text).lower()
    text = _HTML_TAG_RE.sub(' ', text)
    text = _URL_RE.sub(' ', text)
    text = _APOSTROPHE_RE.sub('', text)
    text = _NON_LETTER_RE.sub(' ', text)
    return _WHITESPACE_RE.sub(' ', text).strip()

# Adding Features
The add_features_to function is designed to transform the original dataset into a more refined and informative version. The goal of feature engineering is to extract meaningful patterns from the raw data, providing the model with relevant information for better prediction accuracy.


In [None]:
def add_features_to(df):
    """Add engineered feature columns to a review DataFrame and return it.

    The DataFrame is modified in place (and also returned). It must contain
    the columns 'HelpfulnessNumerator', 'HelpfulnessDenominator', 'Text',
    'Summary' and 'Time'.

    Columns added:
        Helpfulness    -- numerator / denominator, 0 where the ratio is undefined
        ReviewLength   -- whitespace-separated token count of 'Text'
        SummaryLength  -- whitespace-separated token count of 'Summary'
        SentimentScore -- sentiment_score(Summary) + sentiment_score(Text)
        ReviewYear, ReviewMonth -- taken from 'Time'
        CombinedText   -- 'Summary' and 'Text' joined by a space
        CleanedText    -- preprocess_text(CombinedText)

    Columns overwritten:
        HelpfulnessNumerator, HelpfulnessDenominator -- clipped to at most 100
        Time -- converted to datetime, reading the values as seconds since the epoch

    Args:
        df: The review DataFrame to extend.

    Returns:
        The same DataFrame object, with the columns above added or updated.
    """
    numerator = df['HelpfulnessNumerator']
    denominator = df['HelpfulnessDenominator']

    # A zero denominator gives NaN (0/0) or inf (x/0). fillna alone would leave
    # the inf values in place, so mask every zero-denominator row to 0 first.
    ratio = (numerator / denominator).where(denominator != 0, 0)
    df['Helpfulness'] = ratio.fillna(0)

    # Cap extreme vote counts to reduce the impact of outliers. This happens
    # after the ratio so that 'Helpfulness' is based on the raw counts.
    df['HelpfulnessNumerator'] = numerator.clip(upper=100)
    df['HelpfulnessDenominator'] = denominator.clip(upper=100)

    # Word counts of the review body and of its summary
    df['ReviewLength'] = df['Text'].apply(lambda x: len(str(x).split()))
    df['SummaryLength'] = df['Summary'].apply(lambda x: len(str(x).split()))

    df['SentimentScore'] = df['Summary'].apply(sentiment_score) + df['Text'].apply(sentiment_score)

    # Convert 'Time' to datetime and extract year and month
    df['Time'] = pd.to_datetime(df['Time'], unit='s')
    df['ReviewYear'] = df['Time'].dt.year
    df['ReviewMonth'] = df['Time'].dt.month

    # Combine 'Summary' and 'Text' into a single field, then clean it
    df['CombinedText'] = df['Summary'].astype(str) + ' ' + df['Text'].astype(str)
    df['CleanedText'] = df['CombinedText'].apply(preprocess_text)

    return df

# Reuse the cached feature files only when BOTH are present. If just one of
# them exists, rebuild everything so that X_train and X_submission are always
# defined and always come from the same feature-extraction run.
# NOTE: the cache is not invalidated automatically; delete the two CSVs after
# changing add_features_to.
if exists('/kaggle/working/X_train.csv') and exists('/kaggle/working/X_submission.csv'):
    X_train = pd.read_csv("/kaggle/working/X_train.csv")
    X_submission = pd.read_csv("/kaggle/working/X_submission.csv")
else:
    # Process the DataFrame
    train = add_features_to(trainingSet)

    # Merge on Id so that the submission set can have feature columns as well.
    # Both frames carry a 'Score' column, so pandas suffixes them _x / _y;
    # keep the one coming from testingSet.
    X_submission = pd.merge(train, testingSet, left_on='Id', right_on='Id')
    X_submission = X_submission.drop(columns=['Score_x'])
    X_submission = X_submission.rename(columns={'Score_y': 'Score'})

    # The training set is where the score is not null
    X_train = train[train['Score'].notnull()]

    X_submission.to_csv("/kaggle/working/X_submission.csv", index=False)
    X_train.to_csv("/kaggle/working/X_train.csv", index=False)

# Sample + Split into training and testing set

In [None]:
# Separate the target from the predictors, then hold out a quarter of the
# rows for validation (fixed seed for reproducibility).
y = X_train['Score']
X = X_train.drop(columns=['Score'])

(X_train_split, X_test_split,
 Y_train_split, Y_test_split) = train_test_split(X, y, test_size=0.25, random_state=0)

# TF-IDF Vectorization
In this section, we use the TF-IDF technique to convert text data into a numerical format that machine learning models can process. We handle missing values in the 'CleanedText' column by filling them with empty strings to avoid errors during vectorization. TF-IDF assigns weights to words based on their importance, helping the model focus on relevant terms while down-weighting common ones (English stop words are removed outright). The vectorizer is fitted on the training split only and then applied unchanged to the validation and submission data. We limit the number of features to 5,000 to capture the most frequent words while keeping the model size manageable.

In [None]:
# Replace missing cleaned text with an empty string in all three frames
for _frame in (X_train_split, X_test_split, X_submission):
    _frame['CleanedText'] = _frame['CleanedText'].fillna('')

# TF-IDF over at most 5000 terms, with English stop words removed
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)

# Fit on the training split only ...
tfidf_train = tfidf.fit_transform(X_train_split['CleanedText'])

# ... then apply the fitted vectorizer to the validation and submission text
tfidf_test, tfidf_submission = (
    tfidf.transform(frame['CleanedText'])
    for frame in (X_test_split, X_submission)
)

# Feature Selection

In this section, we select specific numeric features that provide additional context and help our model make better predictions. These features are derived from the text and metadata of each review and include aspects such as helpfulness scores, review lengths, sentiment scores, and temporal information.

We then prepare the numerical feature sets for each dataset (training, testing, and submission) by filling any missing values with zeros. This step ensures that all datasets are aligned and consistent for modeling.

In [None]:
# Numeric columns fed to the model alongside the TF-IDF features
numeric_features = [
    'HelpfulnessNumerator',
    'HelpfulnessDenominator',
    'Helpfulness',
    'ReviewLength',
    'SummaryLength',
    'SentimentScore',
    'ReviewYear',
    'ReviewMonth',
]

# Same column subset for every dataset, with missing values replaced by 0
X_train_numeric = X_train_split.loc[:, numeric_features].fillna(0)
X_test_numeric = X_test_split.loc[:, numeric_features].fillna(0)
X_submission_numeric = X_submission.loc[:, numeric_features].fillna(0)

# Apply StandardScaler

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to all three numeric feature sets.
scaler = StandardScaler().fit(X_train_numeric)
X_train_numeric_scaled, X_test_numeric_scaled, X_submission_numeric_scaled = (
    scaler.transform(numeric_block)
    for numeric_block in (X_train_numeric, X_test_numeric, X_submission_numeric)
)

# Append the scaled numeric columns to the right of the TF-IDF matrices
X_train_combined = hstack([tfidf_train, X_train_numeric_scaled])
X_test_combined = hstack([tfidf_test, X_test_numeric_scaled])
X_submission_combined = hstack([tfidf_submission, X_submission_numeric_scaled])

# Model Creation

In [None]:
# Fit a logistic regression (SAGA solver, all CPU cores, fixed seed) on the
# combined training features.
model = LogisticRegression(
    solver='saga',
    max_iter=200,
    n_jobs=-1,
    random_state=0,
).fit(X_train_combined, Y_train_split)

# Predictions for the held-out split, used for evaluation below
Y_test_predictions = model.predict(X_test_combined)

# Model Evaluation

In [None]:
# Evaluate the model on the held-out split
print("Accuracy on testing set = ", accuracy_score(Y_test_split, Y_test_predictions))

# Row-normalized confusion matrix. The label order is made explicit (sorted
# union of true and predicted labels, which is also the order confusion_matrix
# uses by default) so the heatmap axes can show the actual class labels
# instead of positional indices 0..k-1.
class_labels = sorted(set(Y_test_split) | set(Y_test_predictions))
cm = confusion_matrix(
    Y_test_split,
    Y_test_predictions,
    labels=class_labels,
    normalize='true',
)
sns.heatmap(cm, annot=True, xticklabels=class_labels, yticklabels=class_labels)
plt.title('Confusion matrix of the classifier')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

# Create submission file

In [None]:
# Score the submission rows with the trained model and write out Id/Score
X_submission['Score'] = model.predict(X_submission_combined)

submission = X_submission.loc[:, ['Id', 'Score']]
submission.to_csv("/kaggle/working/submission.csv", index=False)