## Imports

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

## Import Data
Data structure and content
* 3 columns:
    1. Rating: the rating given for the review
    2. Summary: summary of the complete review text
    3. Text: the complete text of the review

In [None]:
# Read the raw reviews and expose the 'Score' column under the name 'Rating'.
df = pd.read_csv('Reviews.csv').rename(columns={'Score': 'Rating'})
df

## Data Distribution

In [None]:
# Count the reviews per rating straight from the data. Deriving the bar
# positions and the bar heights from the same Series guarantees they always
# have the same length, even when a rating value never occurs in the data.
rating_counts = df['Rating'].value_counts().sort_index()
ratings = rating_counts.index.to_numpy()
rating_frequencies = rating_counts.tolist()

print('Number of different ratings:', len(ratings))
print('Ratings:', ratings)

# Draw one bar per rating and write the exact count on top of each bar.
fig, ax = plt.subplots()
ax.set_title('Occurrences of each Rating')
ax.set_xlabel('Ratings')
ax.set_ylabel('Frequencies')
bars = ax.bar(ratings, rating_frequencies)
for bar in bars:
    height = bar.get_height()
    # Horizontal centre of the bar, so the label sits in the middle.
    label_x_pos = bar.get_x() + bar.get_width() / 2
    ax.text(label_x_pos, height, s=f'{height}', ha='center', va='bottom')
plt.show()

## Data pre-processing

In [None]:
# Report the dataset size, then the number of missing values per column.
print('Number of data points:', df.shape[0])
print('Number of null values in each column:')
print(df.isnull().sum())

The important columns, `Rating` and `Text`, contain no null values.

Only 27 of the 568,454 rows contain null values, so those rows are removed.

In [None]:
# Drop every row that has a missing value in any column. The result is
# rebound to `df` instead of using inplace=True, which is the form pandas
# recommends and keeps the statement usable in a method chain.
df = df.dropna()

In [None]:
# Re-check how many rows the frame currently holds.
print('Number of data points:', df.shape[0])

Change all ratings above 3 to positive and below 3 to negative. Remove reviews with a neutral rating of 3.

In [None]:
# Keep only clearly polarised reviews: discard the rows rated exactly 3, then
# relabel the remaining ratings as 'pos' (above 3) or 'neg' (everything else).
df = df.drop(index=df.index[df['Rating'] == 3])
df['Rating'] = df['Rating'].map(lambda score: 'pos' if score > 3 else 'neg')
print('Number of data points:', df.shape[0])
print(df.head(10))

In [None]:
# Show the dtype pandas currently holds for each column of df.
print('Checking types of df columns:')
df.dtypes

In [None]:
# Replace the index with a fresh 0..n-1 range; drop=True discards the old
# index instead of keeping it as an extra column.
df = df.reset_index(drop=True)
df

Undersampling data to fix class imbalance and to decrease processing time.

In [None]:
# Balance the classes by undersampling: each class ends up with as many rows
# as the smaller of the two. Sizing by the minimum keeps the cell from
# failing when the negatives happen to outnumber the positives (sampling more
# rows than exist without replacement raises ValueError).
# For quicker experiments, n_per_class can be capped at a smaller number.
pos_rows = df.loc[df['Rating'] == 'pos']
neg_rows = df.loc[df['Rating'] == 'neg']
n_per_class = min(len(pos_rows), len(neg_rows))

df_pos = pos_rows.sample(n_per_class, random_state=1)
# The negatives are only sampled when they are the majority; when they are
# the minority they are all kept, in their existing order.
if len(neg_rows) == n_per_class:
    df_neg = neg_rows
else:
    df_neg = neg_rows.sample(n_per_class, random_state=1)

# Combine both classes, shuffle the rows reproducibly and renumber the index.
df = pd.concat([df_pos, df_neg])
df = df.sample(frac=1, random_state=1).reset_index(drop=True)
df

Split the dataset into 67% training and 33% test data. Stratify on the rating so that both sets keep the same class distribution.

In [None]:
# Hold out 33% of the reviews for testing. Stratifying on the label keeps the
# class proportions equal in both partitions; the fixed seed makes the split
# repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    df['Text'],
    df['Rating'],
    test_size=0.33,
    random_state=42,
    stratify=df['Rating'],
)

## Model

Vectorize string data: convert text in training and test sets to vectors

In [None]:
# Binary bag-of-words: each feature records whether a token occurs in a
# review, not how often it occurs.
vectorizer = CountVectorizer(binary=True)

# Learn the vocabulary from the training texts only, then reuse that same
# vocabulary to encode the test texts.
x_train_vectorized = vectorizer.fit_transform(x_train)
x_test_vectorized = vectorizer.transform(x_test)

In [None]:
# Train model
clf = svm.SVC(kernel='linear')
clf.fit(x_train_vectorized, y_train)

In [None]:
# Evaluate model
print("Accuracy:", clf.score(x_test_vectorized, y_test))
y_pred = clf.predict(x_test_vectorized)
print('F1 score:', f1_score(y_test, y_pred, average=None, labels=['pos', 'neg']))

Parameter tuning and exploring other models