# **Categorizing Customer Reviews**

## **Import Packages and Data**

In [1]:
# Import packages
import math
import re

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import sklearn.metrics as metrics
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords as nltk_stopwords

In [None]:
# Apply seaborn's white-grid theme to every figure drawn below
sns.set_theme(style='whitegrid')

In [None]:
# Load the tab-separated IMDB reviews file into a DataFrame.
# NOTE(review): the path is an absolute location on one machine; consider a
# configurable data directory so the notebook runs elsewhere.
df_reviews = pd.read_csv(
    '/Users/kellyshreeve/Desktop/Data-Sets/imdb_reviews.tsv',
    sep='\t',
    parse_dates=True,
)

In [None]:
# Summarize column dtypes and non-null counts.
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
df_reviews.info()

In [None]:
# Preview 15 randomly chosen rows; the fixed seed keeps the preview identical
# across Restart & Run All so the observations below stay in sync with it
display(df_reviews.sample(15, random_state=123))

### Initial data observations

The dataset contains 47331 observations and 16 columns representing movie reviews and their classification, positive or negative. Runtime_minutes needs to be converted to a numeric type. There are no missing values.

## **Prepare Data**

### Fix data types

In [None]:
# List every distinct runtime value in sorted order to spot non-numeric entries
runtime_values = df_reviews['runtime_minutes'].unique()
display(sorted(runtime_values))

Missing values for runtime were entered as '\\N', which is causing the data to be mis-typed as object. Changing '\\N' to np.nan will allow the variable to be converted to float64.

In [None]:
# Replace the '\\N' missing-value placeholder with np.nan
df_reviews['runtime_minutes'] = df_reviews['runtime_minutes'].replace('\\N', np.nan)

# Convert runtime to float (a plain int64 column cannot hold NaN)
df_reviews['runtime_minutes'] = df_reviews['runtime_minutes'].astype('float')

# Display info; info() prints its own report and returns None, so it is not
# wrapped in print() (that would add a stray "None" line)
df_reviews.info()

Runtime_minutes is converted to float.

### Check for duplicates

In [None]:
# Count rows that repeat an earlier row in every column
full_duplicate_mask = df_reviews.duplicated()
duplicates_count = full_duplicate_mask.sum()

print(f'Number of full duplicates: {duplicates_count}')

There are no fully duplicate rows.

In [None]:
# Count rows whose review text repeats an earlier row's review text
review_text = df_reviews['review']
review_duplicates = review_text.duplicated().sum()

print(f'Number of duplicated review texts: {review_duplicates}')

There are 91 duplicated reviews. Print rows to further inspect the duplication.

In [None]:
# Flag every row whose review text occurs more than once (keep=False marks
# all members of each duplicate group, not only the later copies)
duplicate_review_text = df_reviews['review'].duplicated(keep=False)

display(df_reviews.loc[duplicate_review_text])

The rows are fully duplicated other than the idx index variable. Duplicates will be dropped from the dataset.

In [None]:
# Drop rows whose review text repeats an earlier row, keeping the first one.
# drop_duplicates(inplace=True) returns None, so the old assignment only bound
# None to a misspelled name; rebinding the returned frame states the intent.
df_reviews = df_reviews.drop_duplicates(subset=['review'])

# Verify that no repeated review text remains
duplicates_new = df_reviews['review'].duplicated().sum()

print(f'Updated number of duplicates: {duplicates_new}')

Duplicates have been removed from the dataset.

### Missing values

In [None]:
# Turn the '\N' placeholder into np.nan in every column of the frame
df_reviews = df_reviews.replace(to_replace='\\N', value=np.nan)

In [None]:
# Build a per-column table of missing-value counts and percentages
missing = (
    df_reviews.isna().sum()
    .reset_index()
    .rename(columns={'index': 'column', 0: 'count'})
)

# Share of rows missing in each column, as a percentage rounded to 2 decimals
n_rows = len(df_reviews)
missing['percent'] = ((missing['count'] / n_rows) * 100).round(2)

print('Missing Values:')
display(missing)

End_year is missing over 95% of values, runtime_minutes is missing 1% of values, and genres, average_rating, and votes are all missing less than 1% of values. Because of the high number missing in end_year, this variable will be left out of the analysis. The number missing in the other variables is so low that imputation isn't necessary; the rows missing this information will be dropped from the dataset.

In [None]:
# Drop rows missing values in runtime, genre, rating, or votes
required_columns = ['runtime_minutes', 'genres', 'average_rating', 'votes']
df_reviews = df_reviews.dropna(subset=required_columns)

# Recompute the per-column missing-value table on the reduced frame
missing_dropped = (
    df_reviews.isna().sum()
    .reset_index()
    .rename(columns={'index': 'column', 0: 'count'})
)

n_rows_remaining = len(df_reviews)
missing_dropped['percent'] = ((missing_dropped['count'] / n_rows_remaining) * 100).round(2)

print('Missing values after dropping missing rows:')
display(missing_dropped)

There are no more missing values in runtime, genres, rating, or votes after dropping the rows with missing values in that subset. End_year will be left out of analyses.

### Prepare data conclusion

Duplicates and missing values have been dropped from the dataset. The data is ready for analysis.

## **Exploratory data analysis**

### Number of unique titles

In [None]:
# Count the distinct primary titles that appear in the reviews
titles = df_reviews['primary_title']
movie_count = titles.nunique()

print(f'Number of unique movie titles: {movie_count}')

### Reviews by media type

In [None]:
# Two stacked panels: review counts per media type, then the same counts
# split by review tone
fig, axs = plt.subplots(2, 1, figsize=(8, 8))
ax_freq, ax_tone = axs

# Media frequency plot
type_count = sns.countplot(data=df_reviews, y='title_type', color='steelblue', ax=ax_freq)
type_count.set_title('Frequency of Media Type')
type_count.set_xlabel('Count')
type_count.set_ylabel('Media')

# Review tone by media plot
type_review = sns.countplot(
    data=df_reviews,
    y='title_type',
    hue='sp',
    hue_order=['pos', 'neg'],
    palette=('darkseagreen', 'steelblue'),
    ax=ax_tone,
)
type_review.set_title('Review Tone by Media Type')
type_review.set_xlabel('Count')
type_review.set_ylabel('Media')

# Show plots
fig.tight_layout()
plt.show()

Most reviews are for movies. There are only a small number of reviews for other media types such as shorts, tv episodes, videos, tv movies, etc. Movies, shorts, tv episodes, and tv series tend to have more positive than negative reviews. Videos and tvmovies have more negative than positive reviews.

### Movies and reviews by year

In [None]:
# Create subplots: movies per year (top), reviews per year by tone (bottom)
fig, axs = plt.subplots(2, 1, figsize=(16, 8))

# One shared year axis for every series in this figure, so the top and bottom
# panels line up and the reviews-per-movie division is aligned year by year.
# np.arange excludes its stop value, hence the "+ 1": the last year present in
# the data is always kept, while the axis still extends through 2022.
years = np.arange(df_reviews['start_year'].min(),
                  max(df_reviews['start_year'].max() + 1, 2023))

# Number of movies by year
ax0 = axs[0]

movies_year = df_reviews.groupby('start_year')['tconst'].nunique()
movies_year = movies_year.reindex(index=years, fill_value=0)

movies_year.plot(kind='bar', color='steelblue', ax=ax0,
                 title='Number of Movies Over Years', xlabel='Year',
                 ylabel='Number of Movies', width=0.7)

# Tone of review over time
ax1 = axs[1]

# fill_value=0 in unstack covers years that only have one of the two classes
pos_year = df_reviews.groupby(['start_year', 'pos'])['pos'].count().unstack(fill_value=0)
pos_year = pos_year.reindex(index=years, fill_value=0)

pos_year.plot(kind='bar', ax=ax1, stacked=True, color=['steelblue', 'darkseagreen'],
              legend='reverse', width=0.7)

# Label the bottom panel explicitly instead of relying on pyplot's current axes
ax1.set_title('Tone and Reviews per Movie Over Years')
ax1.set_xlabel('Year')
ax1.set_ylabel('Number of Reviews')
# add_artist keeps this Neg/Pos legend alive when a second legend is created below
ax1.add_artist(ax1.legend(['Neg', 'Pos'], reverse=True))

# Add rolling average reviews per movie over 5 years
axt = ax1.twinx()

review_total = df_reviews.groupby('start_year')['tconst'].count()
review_total = review_total.reindex(index=years, fill_value=0)
# Years without any movie give 0 / 0 = NaN; report those as 0 reviews per movie
review_movie_year = (review_total / movies_year).fillna(0)

# The bar plot places bars at positions 0..N-1, so the year index is dropped
# to put the line on the same positional x-axis
review_movie_year.reset_index(drop=True).rolling(5).mean() \
    .plot(kind='line', ax=axt, color='orange', label='Reviews per Movie (avg over 5 years)')
axt.set_ylabel('Reviews per Movie')

lines, labels = axt.get_legend_handles_labels()
ax1.legend(lines, labels, loc='upper left')

# Display figures
fig.tight_layout()
plt.show()

The number of movies per year generally increases over time until 2006, when we see a sharp decline in the number of movies produced per year. There are generally similar numbers of positive and negative reviews per year. The number of reviews per movie tends to increase over time, from about 1 review per movie in the early 1900s to almost 10 reviews per movie in 2010.

In [None]:
# Distribution of the number of reviews each title received
fig, axs = plt.subplots(1, 2, figsize=(16, 5))

# Number of reviews per title (one entry per tconst)
review_movie = df_reviews.groupby('tconst')['review'].count()

# How many titles received exactly k reviews, ordered by k. Kept under its own
# name so `review_movie` always refers to the per-title counts.
review_movie_freq = review_movie.value_counts().sort_index()

# Barplot of review per movie frequency
ax0 = axs[0]

review_movie_freq.plot(kind='bar', ax=ax0, title='Barplot of Frequencies of Reviews per Movie',
                       xlabel='Number of Reviews', ylabel='Number of Movies')

# KDE of review per movie frequency
ax1 = axs[1]

sns.kdeplot(data=review_movie, ax=ax1)

ax1.set_title('KDE of Frequencies of Reviews per Movie')
ax1.set_xlabel('Number of Reviews')
# A KDE curve shows an estimated probability density, not a percentage
ax1.set_ylabel('Density')

# Display
fig.tight_layout()
plt.show()

Most movies tend to receive between 1 and 5 reviews. The number of movies tends to decrease as the number of reviews increases, though there is a spike in the number of movies that have received 30 reviews.

### Examine train and test set

In [None]:
# Plot positive and negative review counts in the train and test parts
ds_group_tone = sns.countplot(data=df_reviews, x='ds_part', hue='sp', hue_order=['pos', 'neg'],
                              palette=('darkseagreen', 'steelblue'))

# x holds the dataset part and y the row count (the labels were previously swapped)
ds_group_tone.set(title='Review Tone in Train and Test Sets', xlabel='Group', ylabel='Count')

plt.legend(title='Review')
plt.ylim([0, 15000])

plt.show()

There are similar numbers of positive and negative reviews in the training and test sets. Additionally, there are similar numbers of positive reviews and similar numbers of negative reviews across the training and test sets. The classes are mostly balanced, and the training and test sets are similar to each other.

In [None]:
# Chart review tone by year and by movie for the train and test parts
def plot_polarity_for_part(part, ax_year, ax_movie):
    '''Draw the yearly and per-movie polarity charts for one dataset part.

    part: value of the ds_part column to select ('train' or 'test')
    ax_year: axes for the stacked bar chart of reviews per year by polarity
    ax_movie: axes for the KDE of per-movie negative/positive review counts

    Returns the (year x polarity) and (movie x polarity) count tables that
    were plotted.'''
    part_reviews = df_reviews[df_reviews['ds_part'] == part]
    part_label = part.capitalize()

    # Reviews per year split by polarity; missing years/classes are shown as 0
    year_counts = part_reviews.groupby(['start_year', 'pos'])['pos'].count().unstack()
    # np.arange excludes its stop value, so "+ 1" keeps the last year in the data
    year_index = np.arange(year_counts.index.min(),
                           max(year_counts.index.max() + 1, 2020))
    year_counts = year_counts.reindex(index=year_index).fillna(0)

    # The bars count reviews (rows), so the y-axis is labelled accordingly
    year_counts.plot(kind='bar', ax=ax_year, stacked=True,
                     color=['steelblue', 'darkseagreen'], width=0.7,
                     title=f'{part_label}: Reviews per Year by Polarity',
                     xlabel='Year', ylabel='Number of Reviews')
    ax_year.legend(['Neg', 'Pos'], reverse=True)

    # Negative (column 0) and positive (column 1) review counts per movie
    movie_counts = part_reviews.groupby(['tconst', 'pos'])['pos'].count().unstack()
    sns.kdeplot(movie_counts[0], color='steelblue', label='negative', ax=ax_movie)
    sns.kdeplot(movie_counts[1], color='darkseagreen', label='positive', ax=ax_movie)

    ax_movie.legend(reverse=True)
    ax_movie.set_title(f'{part_label}: Polarity of Reviews by Movie')
    # The KDE is over the number of reviews a movie received, on a density scale
    ax_movie.set_xlabel('Reviews per Movie')
    ax_movie.set_ylabel('Density')

    return year_counts, movie_counts


fig, axs = plt.subplots(2, 2, figsize=(16, 8),
                        gridspec_kw=dict(width_ratios=(2, 1), height_ratios=(1, 1)))

# Top row: train part; bottom row: test part
df_train_year, tone_train = plot_polarity_for_part('train', axs[0][0], axs[0][1])
df_test_year, tone_test = plot_polarity_for_part('test', axs[1][0], axs[1][1])

# Display
fig.tight_layout()
plt.show()

There are similar distributions of movie reviews by year and by polarity in the training and test sets. The sets are similar and can be used to train and test the model.

### Exploratory analysis conclusion

## **Evaluation Procedure**

Create evaluation routine which can be used for all models in this project.

In [None]:
def evaluate_model(model, features_train, target_train, features_test, target_test):
    '''Draws the F1-vs-threshold curve, ROC curve, and precision-recall curve
    for two data subsets on one figure, and prints a table of Accuracy, F1,
    APS, and ROC AUC (rounded to 2 decimals) for both subsets.

    The first subset is labelled 'train' and the second 'test' in the plots
    and in the printed table.

    model: fitted classification model exposing predict and predict_proba
    features_train: features of the training set
    target_train: target for the training set
    features_test: features of the test set
    target_test: target for the test set

    Returns None; the results are only plotted and printed.'''
    eval_stats = {}

    # Thresholds highlighted with a cross on every curve (0.5 is drawn in red)
    marked_thresholds = (0.2, 0.4, 0.5, 0.6, 0.8)

    fig, axs = plt.subplots(1, 3, figsize=(20, 6))

    # `subset` rather than `type`, so the builtin is not shadowed
    for subset, features, target in (('train', features_train, target_train),
                                     ('test', features_test, target_test)):

        eval_stats[subset] = {}

        pred_target = model.predict(features)
        # Column 1 of predict_proba is used as the score of the positive class
        pred_proba = model.predict_proba(features)[:, 1]

        # F1 scores on a fixed grid of decision thresholds
        f1_thresholds = np.arange(0, 1.01, 0.05)
        f1_scores = [metrics.f1_score(target, pred_proba >= threshold) for threshold in f1_thresholds]

        # ROC
        fpr, tpr, roc_thresholds = metrics.roc_curve(target, pred_proba)
        roc_auc = metrics.roc_auc_score(target, pred_proba)
        eval_stats[subset]['ROC AUC'] = roc_auc

        # PRC
        precision, recall, pr_thresholds = metrics.precision_recall_curve(target, pred_proba)
        aps = metrics.average_precision_score(target, pred_proba)
        eval_stats[subset]['APS'] = aps

        # One curve color per subset
        color = 'blue' if subset == 'train' else 'green'

        # F1 score curve
        ax0 = axs[0]
        max_f1_score_idx = np.argmax(f1_scores)
        # ':.2f' (two decimals) - the former ':2f' meant "minimum width 2" and
        # printed six decimals in the legend
        ax0.plot(f1_thresholds, f1_scores, color=color,
                 label=f'{subset}, max={f1_scores[max_f1_score_idx]:.2f} @ {f1_thresholds[max_f1_score_idx]:.2f}')
        # setting crosses for some thresholds
        for threshold in marked_thresholds:
            closest_value_idx = np.argmin(np.abs(f1_thresholds - threshold))
            marker_color = 'orange' if threshold != 0.5 else 'red'
            ax0.plot(f1_thresholds[closest_value_idx], f1_scores[closest_value_idx],
                     color=marker_color, marker='X', markersize=7)
        ax0.set_xlim([-0.02, 1.02])
        ax0.set_ylim([-0.02, 1.02])
        ax0.set_xlabel('Threshold')
        ax0.set_ylabel('F1')
        ax0.legend(loc='lower center')
        ax0.set_title('F1 Score')

        # ROC curve
        ax1 = axs[1]
        ax1.plot(fpr, tpr, color=color, label=f'{subset}, ROC AUC={roc_auc:.2f}')
        # setting crosses for some thresholds
        for threshold in marked_thresholds:
            closest_value_idx = np.argmin(np.abs(roc_thresholds - threshold))
            marker_color = 'orange' if threshold != 0.5 else 'red'
            ax1.plot(fpr[closest_value_idx], tpr[closest_value_idx], color=marker_color,
                     marker='X', markersize=7)
        # Diagonal reference line
        ax1.plot([0, 1], [0, 1], color='grey', linestyle='--')
        ax1.set_xlim([-0.02, 1.02])
        ax1.set_ylim([-0.02, 1.02])
        ax1.set_xlabel('FPR')
        ax1.set_ylabel('TPR')
        ax1.legend(loc='lower center')
        ax1.set_title('ROC Curve')

        # PRC
        ax2 = axs[2]
        ax2.plot(recall, precision, color=color, label=f'{subset}, AP={aps:.2f}')
        # setting crosses for some thresholds
        for threshold in marked_thresholds:
            closest_value_idx = np.argmin(np.abs(pr_thresholds - threshold))
            marker_color = 'orange' if threshold != 0.5 else 'red'
            ax2.plot(recall[closest_value_idx], precision[closest_value_idx], color=marker_color,
                     marker='X', markersize=7)
        ax2.set_xlim([-0.02, 1.02])
        ax2.set_ylim([-0.02, 1.02])
        ax2.set_xlabel('recall')
        ax2.set_ylabel('precision')
        ax2.legend(loc='lower center')
        ax2.set_title('PRC')

        # Metrics of the model's own hard predictions
        eval_stats[subset]['Accuracy'] = metrics.accuracy_score(target, pred_target)
        eval_stats[subset]['F1'] = metrics.f1_score(target, pred_target)

    # Summary table: one column per subset, one row per metric
    df_eval_stats = pd.DataFrame(eval_stats)
    df_eval_stats = df_eval_stats.round(2)
    df_eval_stats = df_eval_stats.reindex(index=('Accuracy', 'F1', 'APS', 'ROC AUC'))

    print(df_eval_stats)

## **Normalization**

Remove special characters and numbers from text and convert to lowercase.

In [None]:
# Define function to normalize text
def normalize_text(text):
    '''Lowercase the text, replace every character that is not an ASCII letter
    or an apostrophe with a space, and collapse whitespace runs so the words
    are separated by single spaces.

    text: string to normalize

    Returns the normalized string.'''
    letters_only = re.sub(r"[^a-zA-Z']", ' ', text.lower())
    return " ".join(letters_only.split())

# Normalize every review and store the result in a new column
normalized_reviews = df_reviews['review'].apply(normalize_text)
df_reviews['reviews_norm'] = normalized_reviews

# Show raw and normalized text side by side for the first rows
display(df_reviews.loc[:, ['review', 'reviews_norm']].head())

Reviews are converted to lowercase with special characters removed.

## **Train/Validate/Test Split**

In [None]:
# Define train and test subsets from the predefined ds_part column.
# The test slice is copied so that columns can be added to it later without
# writing into a view of df_reviews.
df_train_full = df_reviews[df_reviews['ds_part'] == 'train']
df_test = df_reviews[df_reviews['ds_part'] == 'test'].copy()

# Split train into train (70%) and validate (30%) with a fixed seed
df_train, df_validate = train_test_split(df_train_full, test_size=0.3, random_state=123)

# New columns are added to these frames further down; explicit copies make
# them independent of df_reviews and avoid pandas' SettingWithCopyWarning
df_train = df_train.copy()
df_validate = df_validate.copy()

# Define train, validate and test targets
targ_train = df_train['pos']
targ_valid = df_validate['pos']
targ_test = df_test['pos']

print(f'Train Shape: {df_train.shape}')
print(f'Validate Shape: {df_validate.shape}')
print(f'Test Shape: {df_test.shape}')

The training and test sets are split approximately 50/50. The test set has 270 fewer observations than the training set.

## **NLP Classification Models**

Train models to classify the tone of the review.

### Model 0 - Constant Model

In [None]:
# Fit a constant baseline classifier on the training reviews
dummy = DummyClassifier()
dummy.fit(df_train['reviews_norm'], targ_train)

# Evaluate on the train and validate sets
evaluate_model(
    dummy,
    df_train['reviews_norm'], targ_train,
    df_validate['reviews_norm'], targ_valid,
)

### Model 1 - NLTK, TF-IDF and Logistic Regression

In [None]:
# Lemmatize reviews with nltk lemmatizer
nltk_lemmatizer = WordNetLemmatizer()

def nltk_lemmatize(text):
    '''Tokenize text with word_tokenize, lemmatize each token with the
    WordNet lemmatizer, and return the lemmas joined by single spaces.'''
    return " ".join(
        nltk_lemmatizer.lemmatize(token) for token in word_tokenize(text)
    )

# Add a lemmatized copy of the normalized reviews to the train and validate frames
for frame in (df_train, df_validate):
    frame['reviews_nltk_lemma'] = frame['reviews_norm'].apply(nltk_lemmatize)

display(df_train.head())

Reviews are lemmatized via the nltk lemmatizer.

In [None]:
# Create TF-IDF features from the lemmatized reviews, ignoring English stop words
stop_words = nltk_stopwords.words('english')
tfidf_vectorizer = TfidfVectorizer(stop_words=stop_words)

train_lemmas = df_train['reviews_nltk_lemma']
valid_lemmas = df_validate['reviews_nltk_lemma']

# The vectorizer is fitted on the train set only; the validate set is
# transformed with the vectorizer fitted on train
tf_idf_train = tfidf_vectorizer.fit_transform(train_lemmas)
tf_idf_valid = tfidf_vectorizer.transform(valid_lemmas)

print(f'TF-IDF train matrix size: {tf_idf_train.shape}')
print(f'TF-IDF validate matrix size: {tf_idf_valid.shape}')

TF-IDF matrix is fit and transformed on the train set and the validate set is transformed. Train and validate TF-IDF matrices both have 55134 columns, representing 55134 words.

In [None]:
# 