# Data Cleaning and Preprocessing

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv('data/train.csv', usecols=['id', 'VotedHelpful', 'TotalVotes', 'summary', 'reviewText', 'genres', 'Score', 'album_mbid', 'reviewerID', 'artist_mbid'])
df.head()

Unnamed: 0,id,reviewerID,album_mbid,artist_mbid,VotedHelpful,TotalVotes,summary,reviewText,genres,Score
0,1,A0001624UKLQG4OFIM8X,B000002KIC,8c90ad8c-9150-4c51-a1eb-342232e99d06,0,0,very good listening,Ive liked the band since first heard them. Fig...,"Folk Rock,Country Rock,Country,Rock,Pop,Singer...",5.0
1,2,A00082583JGF0RURTDN8A,B000007T1M,cc0b7089-c08d-4c10-b6b0-873582c17fd6,0,0,Best album ever!!!!,I love this album sents it came out!!! This is...,"Alternative Metal,Metal,Pop Metal,Pop,Rock",5.0
2,3,A00162161QSZVJYMHX0T4,B0000001T0,f1f81989-dfa9-4bd3-805e-dcf3900c43e3,0,0,"A great Album , good seller",Bought this used. An awesome country rock albu...,"Smooth Jazz,Jazz,Pop,Jazz Fusion",5.0
3,4,A00162161QSZVJYMHX0T4,B0000001UU,0c361ea5-98c6-4947-900b-201833f2dd84,0,0,Larry and Lee = a future Classic!,This album is sure to become a future classic....,"Smooth Jazz,Jazz,Pop,Easy Listening",
4,5,A00162161QSZVJYMHX0T4,B0000001SB,f1f81989-dfa9-4bd3-805e-dcf3900c43e3,0,0,Wow! Where be Mosada?,"I heard this album a few times on youtube.com,...","Adult Contemporary,Jazz Fusion,Smooth Jazz,Jaz...",5.0


In [4]:
df = df.fillna({'VotedHelpful': 0, 'TotalVotes': 0, 'summary': '', 'reviewText': '', 'genres': ''})

In [5]:
df.to_csv('data/cleaned_train.csv', index=False)

# Feature Engineering

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

df = pd.read_csv('data/cleaned_train.csv')

In [2]:
# Simple Features
df['review_length'] = np.log(df['reviewText'].apply(lambda x: len(x) if isinstance(x, str) else 0) + 1)

df['num_album_reviews'] = np.log(df.groupby('album_mbid')['album_mbid'].transform('count') + 1)

df['num_reviewer_reviews'] = np.log(df.groupby('reviewerID')['reviewerID'].transform('count') + 1)

df['num_artist_reviews'] = np.log(df.groupby('artist_mbid')['artist_mbid'].transform('count') + 1)

df['summary_length'] = np.log(df['summary'].apply(lambda x: len(x) if isinstance(x, str) else 0) + 1)

df['helpfulness_ratio'] = df.apply(lambda x: x['VotedHelpful'] / (x['TotalVotes'] + 1), axis=1)

df['num_genres'] = np.log(df['genres'].apply(lambda x: len(x.split(',')) if x else 0) + 1)

df['has_votes'] = (df['TotalVotes'] > 0).astype(int)

df['num_exclamation_marks'] = np.log(df['reviewText'].apply(lambda x: x.count('!') if isinstance(x, str) else 0) + 1)

df['num_caps'] = np.log(df['reviewText'].apply(lambda x: sum(1 for c in x if c.isupper()) if isinstance(x, str) else 0) + 1)

df.head()

Unnamed: 0,id,reviewerID,album_mbid,artist_mbid,VotedHelpful,TotalVotes,summary,reviewText,genres,Score,review_length,num_album_reviews,num_reviewer_reviews,num_artist_reviews,summary_length,helpfulness_ratio,num_genres,has_votes,num_exclamation_marks,num_caps
0,1,A0001624UKLQG4OFIM8X,B000002KIC,8c90ad8c-9150-4c51-a1eb-342232e99d06,0,0,very good listening,Ive liked the band since first heard them. Fig...,"Folk Rock,Country Rock,Country,Rock,Pop,Singer...",5.0,4.770685,5.220356,0.693147,5.375278,2.995732,0.0,2.484907,0,0.0,1.791759
1,2,A00082583JGF0RURTDN8A,B000007T1M,cc0b7089-c08d-4c10-b6b0-873582c17fd6,0,0,Best album ever!!!!,I love this album sents it came out!!! This is...,"Alternative Metal,Metal,Pop Metal,Pop,Rock",5.0,4.672829,5.888878,0.693147,6.575076,2.995732,0.0,1.791759,0,2.197225,1.098612
2,3,A00162161QSZVJYMHX0T4,B0000001T0,f1f81989-dfa9-4bd3-805e-dcf3900c43e3,0,0,"A great Album , good seller",Bought this used. An awesome country rock albu...,"Smooth Jazz,Jazz,Pop,Jazz Fusion",5.0,5.062595,2.397895,1.386294,4.532599,3.332205,0.0,1.609438,0,0.693147,2.079442
3,4,A00162161QSZVJYMHX0T4,B0000001UU,0c361ea5-98c6-4947-900b-201833f2dd84,0,0,Larry and Lee = a future Classic!,This album is sure to become a future classic....,"Smooth Jazz,Jazz,Pop,Easy Listening",,5.525453,2.833213,1.386294,4.969813,3.526361,0.0,1.609438,0,0.0,1.791759
4,5,A00162161QSZVJYMHX0T4,B0000001SB,f1f81989-dfa9-4bd3-805e-dcf3900c43e3,0,0,Wow! Where be Mosada?,"I heard this album a few times on youtube.com,...","Adult Contemporary,Jazz Fusion,Smooth Jazz,Jaz...",5.0,5.83773,1.94591,1.386294,4.532599,3.135494,0.0,1.94591,0,0.693147,2.302585


In [3]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

def compute_sentiment(text):
    if not isinstance(text, str) or not text:
        return pd.Series([0.33, 0.33, 0.33], index=['pos', 'neu', 'neg'])
    sentiment = analyzer.polarity_scores(text)
    return pd.Series([sentiment['pos'], sentiment['neu'], sentiment['neg']], index=['pos', 'neu', 'neg'])
df[['review_pos', 'review_neu', 'review_neg']] = df['reviewText'].apply(compute_sentiment)
df[['summary_pos', 'summary_neu', 'summary_neg']] = df['summary'].apply(compute_sentiment)
df.head()

Unnamed: 0,id,reviewerID,album_mbid,artist_mbid,VotedHelpful,TotalVotes,summary,reviewText,genres,Score,...,num_genres,has_votes,num_exclamation_marks,num_caps,review_pos,review_neu,review_neg,summary_pos,summary_neu,summary_neg
0,1,A0001624UKLQG4OFIM8X,B000002KIC,8c90ad8c-9150-4c51-a1eb-342232e99d06,0,0,very good listening,Ive liked the band since first heard them. Fig...,"Folk Rock,Country Rock,Country,Rock,Pop,Singer...",5.0,...,2.484907,0,0.0,1.791759,0.113,0.887,0.0,0.615,0.385,0.0
1,2,A00082583JGF0RURTDN8A,B000007T1M,cc0b7089-c08d-4c10-b6b0-873582c17fd6,0,0,Best album ever!!!!,I love this album sents it came out!!! This is...,"Alternative Metal,Metal,Pop Metal,Pop,Rock",5.0,...,1.791759,0,2.197225,1.098612,0.394,0.606,0.0,0.729,0.271,0.0
2,3,A00162161QSZVJYMHX0T4,B0000001T0,f1f81989-dfa9-4bd3-805e-dcf3900c43e3,0,0,"A great Album , good seller",Bought this used. An awesome country rock albu...,"Smooth Jazz,Jazz,Pop,Jazz Fusion",5.0,...,1.609438,0,0.693147,2.079442,0.374,0.552,0.074,0.636,0.364,0.0
3,4,A00162161QSZVJYMHX0T4,B0000001UU,0c361ea5-98c6-4947-900b-201833f2dd84,0,0,Larry and Lee = a future Classic!,This album is sure to become a future classic....,"Smooth Jazz,Jazz,Pop,Easy Listening",,...,1.609438,0,0.0,1.791759,0.111,0.889,0.0,0.0,1.0,0.0
4,5,A00162161QSZVJYMHX0T4,B0000001SB,f1f81989-dfa9-4bd3-805e-dcf3900c43e3,0,0,Wow! Where be Mosada?,"I heard this album a few times on youtube.com,...","Adult Contemporary,Jazz Fusion,Smooth Jazz,Jaz...",5.0,...,1.94591,0,0.693147,2.302585,0.176,0.77,0.054,0.577,0.423,0.0


In [4]:
from sklearn.preprocessing import StandardScaler

feature_cols = ['review_length', 'num_album_reviews', 'num_reviewer_reviews', 'num_artist_reviews', 'summary_length', 'helpfulness_ratio', 'num_genres', 'has_votes', 'review_pos', 'review_neu', 'review_neg', 'summary_pos', 'summary_neu', 'summary_neg', 'num_exclamation_marks'] 
scaler = StandardScaler()
df[feature_cols] = scaler.fit_transform(df[feature_cols])
df.drop(columns=['reviewerID', 'artist_mbid', 'album_mbid', 'VotedHelpful', 'TotalVotes'], inplace=True)
df.head()

Unnamed: 0,id,summary,reviewText,genres,Score,review_length,num_album_reviews,num_reviewer_reviews,num_artist_reviews,summary_length,...,num_genres,has_votes,num_exclamation_marks,num_caps,review_pos,review_neu,review_neg,summary_pos,summary_neu,summary_neg
0,1,very good listening,Ive liked the band since first heard them. Fig...,"Folk Rock,Country Rock,Country,Rock,Pop,Singer...",5.0,-1.417379,0.936589,-0.660046,0.209389,-0.209026,...,1.335706,-1.443994,-0.613463,1.791759,-0.80499,1.280407,-0.885179,0.951609,-0.784358,-0.327719
1,2,Best album ever!!!!,I love this album sents it came out!!! This is...,"Alternative Metal,Metal,Pop Metal,Pop,Rock",5.0,-1.519264,1.398354,-0.660046,0.903881,-0.209026,...,-0.505613,-1.443994,2.934139,1.098612,2.057346,-1.68608,-0.885179,1.296698,-1.126359,-0.327719
2,3,"A great Album , good seller",Bought this used. An awesome country rock albu...,"Smooth Jazz,Jazz,Pop,Jazz Fusion",5.0,-1.113449,-1.012954,-0.073685,-0.278388,0.368516,...,-0.989943,-1.443994,0.505681,2.079442,1.853621,-2.256152,0.664902,1.015178,-0.847358,-0.327719
3,4,Larry and Lee = a future Classic!,This album is sure to become a future classic....,"Smooth Jazz,Jazz,Pop,Easy Listening",,-0.631534,-0.712269,-0.073685,-0.02531,0.701778,...,-0.989943,-1.443994,-0.613463,1.791759,-0.825362,1.301521,-0.885179,-0.910055,1.060647,-0.327719
4,5,Wow! Where be Mosada?,"I heard this album a few times on youtube.com,...","Adult Contemporary,Jazz Fusion,Smooth Jazz,Jaz...",5.0,-0.306399,-1.325151,-0.073685,-0.278388,0.03087,...,-0.096117,-1.443994,0.505681,2.302585,-0.163256,0.045251,0.245961,0.83658,-0.670358,-0.327719


In [5]:
df.to_csv('data/engineered_features_train.csv', index=False)

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import pandas as pd
df = pd.read_csv('data/engineered_features_train.csv')
df['reviewText'] = df['reviewText'].fillna('')
df['summary'] = df['summary'].fillna('')
df['text'] = df['reviewText'] + ' ' + df['summary']

vectorizer = TfidfVectorizer(
    max_features=22000, 
    ngram_range=(1, 2),
    min_df=5,
    max_df=0.95,
    stop_words='english')
corpus = df['text'].tolist()
X_tfidf = vectorizer.fit_transform(corpus)
tfidf_df = pd.DataFrame(X_tfidf.toarray(), columns=vectorizer.get_feature_names_out())
tfidf_df['id'] = df['id'].values
tfidf_df['Score'] = df['Score'].values
print("TF-IDF feature matrix shape:", X_tfidf.shape)

svd = TruncatedSVD(n_components=130, random_state=42)
X_reduced = svd.fit_transform(X_tfidf)
reduced_df = pd.DataFrame(X_reduced, columns=[f'svd_{i}' for i in range(130)])
reduced_df['id'] = df['id'].values

df_final = pd.merge(df, reduced_df, on=['id'])
assert df_final.shape[0] == df.shape[0], "Row count mismatch after merging SVD features"

df_final = df_final.drop(columns=['reviewText', 'summary', 'text'])

TF-IDF feature matrix shape: (447583, 22000)


In [7]:
corpus = df['genres'].tolist()
vectorizer = TfidfVectorizer(
    max_features=100, 
    ngram_range=(1, 1),
    min_df=2,
    max_df=0.9,
)
X_genre_tfidf = vectorizer.fit_transform(corpus)
genre_tfidf_df = pd.DataFrame(X_genre_tfidf.toarray(), columns=vectorizer.get_feature_names_out())
svd = TruncatedSVD(n_components=20, random_state=42)
X_genre_reduced = svd.fit_transform(X_genre_tfidf)
genre_tfidf_df = pd.DataFrame(X_genre_reduced, columns=[f'genre_svd_{i}' for i in range(20)])
genre_tfidf_df['id'] = df['id'].values
df_final = pd.merge(df_final, genre_tfidf_df, on=['id'])

df_final = df_final.drop(columns=['genres'])
df_final.to_csv('data/final_features.csv', index=False)

# Build Predictive Model

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from imblearn.ensemble import BalancedBaggingClassifier

df = pd.read_csv('data/tfidf_svd_features.csv')
df.head()

df = df.dropna(subset=['Score'])

X = df.drop(columns=['id', 'Score'])
y = df['Score']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = BalancedBaggingClassifier(n_estimators=20, estimator=LogisticRegression(C=10.0, max_iter=2000, random_state=42, class_weight='balanced'), random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))
f1 = f1_score(y_test, y_pred, average='macro')
print("Macro F1 Score on test set:", f1)


              precision    recall  f1-score   support

         1.0       0.61      0.69      0.65      7858
         2.0       0.21      0.36      0.27      4683
         3.0       0.28      0.28      0.28      9018
         4.0       0.40      0.41      0.40     16144
         5.0       0.83      0.72      0.77     35814

    accuracy                           0.57     73517
   macro avg       0.47      0.49      0.48     73517
weighted avg       0.61      0.57      0.59     73517

Macro F1 Score on test set: 0.4751475877331462


# Build Submission

In [2]:
test_set = pd.read_csv('data/test.csv')
submission_df = pd.read_csv('data/tfidf_svd_features.csv')
test_ids = test_set['id'].values
submission_df = submission_df[submission_df['id'].isin(test_ids)]
submission_df

Unnamed: 0,id,Score,review_length,num_album_reviews,num_reviewer_reviews,num_artist_reviews,summary_length,helpfulness_ratio,num_genres,has_votes,...,svd_120,svd_121,svd_122,svd_123,svd_124,svd_125,svd_126,svd_127,svd_128,svd_129
3,4,,-0.631534,-0.712269,-0.073685,-0.025310,0.701778,-1.063658,-0.989943,-1.443994,...,0.004374,-0.030048,-0.006078,0.010108,-0.012351,0.012199,-0.011087,-0.011792,-0.023892,0.003345
5,6,,-1.444192,0.098725,-0.660046,0.534847,-1.085839,-1.063658,-0.505613,-1.443994,...,0.001118,0.001602,-0.000312,-0.003727,0.009375,-0.002453,-0.011806,-0.012458,-0.008381,0.018861
14,15,,-1.529040,0.181474,-0.660046,0.622546,0.173992,-1.063658,1.335706,-1.443994,...,-0.031773,-0.019725,-0.020419,0.017053,0.019607,-0.005949,0.013065,-0.018793,0.016225,0.002518
25,26,,-1.167548,-0.418792,-0.660046,-0.973225,-0.125279,0.480818,0.571490,0.692523,...,0.005505,-0.003607,-0.008739,-0.000209,0.016142,-0.003046,0.010213,0.027005,-0.001868,-0.010120
30,31,,-1.365740,-0.055406,-0.660046,-0.661232,1.023120,-1.063658,-0.096117,0.692523,...,0.008585,0.029376,-0.004095,0.021503,-0.010043,-0.006968,-0.004576,0.010261,0.013267,-0.001377
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
447554,447555,,0.586999,0.366115,-0.660046,0.622546,0.030870,0.480818,-0.989943,0.692523,...,-0.006150,-0.001344,0.023736,-0.015111,-0.022615,0.015773,0.013228,0.023605,0.004746,-0.019978
447557,447558,,0.882011,0.814379,-0.660046,0.381910,1.527350,0.995643,0.258604,0.692523,...,-0.007591,-0.013841,-0.017615,0.011504,-0.026578,0.027592,-0.009224,0.001483,-0.005698,-0.002014
447560,447561,,-0.146335,0.518538,-0.073685,-0.230621,0.892693,-1.063658,-0.096117,0.692523,...,0.029245,0.042426,-0.027481,0.001578,0.048574,0.001965,-0.019551,0.001460,0.023410,-0.008932
447568,447569,,0.144697,-1.557561,-0.660046,2.755716,1.676701,-1.063658,-0.505613,-1.443994,...,0.027430,0.001462,0.024325,-0.008832,-0.031370,-0.022717,-0.020784,-0.016262,-0.018876,0.017309


In [3]:
submission_df = submission_df.drop(columns=['Score'])
test_predictions = model.predict(submission_df.drop(columns=['id']))
# Create submission dataframe - ensure all test IDs are included
submission = pd.DataFrame({
    'id': test_set['id'],
    'Score': test_predictions
})

# If there are any missing IDs, fill them with a default prediction (mode of training data)
missing_ids = set(test_ids) - set(submission_df['id'])
if len(missing_ids) > 0:
    print(f"Warning: {len(missing_ids)} test IDs not found in training data. Using default prediction.")
    default_score = int(y_train.mode()[0])  # Mode of training scores
    missing_rows = pd.DataFrame({
        'id': list(missing_ids),
        'Score': [default_score] * len(missing_ids)
    })
    submission = pd.concat([submission, missing_rows], ignore_index=True)

# Ensure all test IDs are present and in correct order
submission = submission[submission['id'].isin(test_ids)].sort_values('id')
submission = submission[['id', 'Score']]

# Save submission
submission.to_csv('submission.csv', index=False)

In [5]:
submission['Score'].value_counts()

Score
5.0    33861
4.0    18216
3.0     9824
1.0     9536
2.0     8563
Name: count, dtype: int64