In [1]:
import numpy as np
import nltk
import pandas as pd
import string

from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
import unicodedata

from sklearn.ensemble import RandomForestRegressor, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB

In [2]:
# Read the CSV (path is relative to the working directory) into the notebook-level
# DataFrame `df`; load_data() below reads this global directly.
df = pd.read_csv('data/final_office.csv')

In [3]:
# Show column dtypes and non-null counts; the output below shows that `text`
# is the only column with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51792 entries, 0 to 51791
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Unnamed: 0     51792 non-null  int64  
 1   index          51792 non-null  int64  
 2   season         51792 non-null  int64  
 3   episode        51792 non-null  int64  
 4   episode_name   51792 non-null  object 
 5   director       51792 non-null  object 
 6   writer         51792 non-null  object 
 7   character      51792 non-null  object 
 8   text           51437 non-null  object 
 9   averageRating  51792 non-null  float64
 10  numVotes       51792 non-null  int64  
dtypes: float64(1), int64(5), object(5)
memory usage: 4.3+ MB


In [4]:
def remove_accents(input_str):
    '''
        Reduces a string to plain ASCII.
        The text is NFKD-normalized first, so an accented letter splits into a
        base letter plus combining marks; every non-ASCII code point is then
        dropped.
        INPUT:
            - String
        OUTPUT:
            - String containing only ASCII characters
    '''
    decomposed = unicodedata.normalize('NFKD', input_str)
    # 'ignore' silently discards anything that has no ASCII encoding.
    return decomposed.encode('ASCII', 'ignore').decode()

def load_data(data=None):
    '''
        Builds one text document and one rating per episode.
        For every (season, episode) pair in the hard-coded `shows` table, all
        `text` values of that episode are joined with single spaces (missing
        values are skipped by Series.str.cat), accents are stripped with
        remove_accents, and the episode's mean `averageRating` is rounded to
        one decimal.
        INPUT:
            - DataFrame with columns 'season', 'episode', 'text' and
              'averageRating' (optional; defaults to the notebook-level `df`)
        OUTPUT:
            - List of episode documents (strings)
            - List of episode ratings (floats rounded to 1 decimal), in the
              same order as the documents
    '''
    if data is None:
        # Keep the original zero-argument behavior: use the global DataFrame.
        data = df

    # Episodes to include, keyed by season.
    shows = {1: [1, 2, 3, 4, 5, 6],
            2: [1, 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22],
            3: [1, 2,  3,  4,  5,  6,  7,  8,  9, 10, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23],
            4: [1, 3,  5,  7,  9, 10, 11, 12, 13, 14],
            5: [1, 3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26],
            6: [1, 2,  3,  4,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 19, 20, 21, 22, 23, 24, 25, 26],
            7: [1, 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24],
            8: [1, 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24],
            9: [1, 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]}

    docs = []
    ratings = []
    for season, episodes in shows.items():
        # Filter the season once instead of re-scanning the full frame per episode.
        season_rows = data[data['season'] == season]
        for episode in episodes:
            episode_rows = season_rows[season_rows['episode'] == episode]

            # Collapse the episode's lines into a single string, then normalize it.
            paragraph = episode_rows['text'].str.cat(sep=' ')
            docs.append(remove_accents(paragraph))

            # Round each rating once as it is appended (not the whole list every pass).
            ratings.append(round(episode_rows['averageRating'].mean(), 1))

    return docs, ratings

In [5]:
# X: one document string per episode; y: the matching rounded ratings
# (both are the lists returned by load_data, in the same order).
X,y = load_data()

In [6]:
# Hold out 20% of the episodes for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Baseline model: MSE when predicting the mean average rating for every episode

In [7]:
# Baseline: predict the same constant rating for every held-out episode.
# NOTE(review): 8.23 is presumably the mean of averageRating -- confirm, or
# derive it from y_train instead of hard-coding it.
baseline_rating = 8.23
# Size the prediction array from y_test so it stays valid if the split changes.
y_mean = np.full((len(y_test), 1), baseline_rating)
mean_squared_error(y_test, y_mean)

0.2971222222222224

In [8]:
def vectorize_data(data, trans_vec,train_flag):
    '''
        Turns an array of statements into a dense matrix using a vectorizer.
        When train_flag is truthy the vectorizer is fitted on `data` and then
        applied (fit_transform); otherwise the vectorizer is applied as-is
        (transform).
        INPUT:
            - Array of statements
            - Vectorizer exposing fit_transform / transform
            - Flag: fit the vectorizer on `data` before transforming
        OUTPUT:
            - Result of calling .toarray() on the vectorizer's output
    '''
    to_matrix = trans_vec.fit_transform if train_flag else trans_vec.transform
    return to_matrix(data).toarray()

In [9]:
def score_final_model(model, tfidf, X_train, y_train, X_test, y_test):
    '''
        Fits the vectorizer on X_train, fits the model on the resulting matrix,
        and scores its predictions on X_test.
        X_test is transformed with the vectorizer already fitted on X_train.
        The score is the mean squared error between y_test and the predictions.
        INPUT:
            - Model class (must expose fit / predict)
            - TfidfVectorizer class
            - Array X_train
            - Array y_train
            - Array X_test
            - Array y_test
        OUTPUT:
            - Float (mean squared error on the test set)
    '''
    train_features = vectorize_data(X_train, tfidf, True)
    test_features = vectorize_data(X_test, tfidf, False)

    model.fit(train_features, y_train)
    predictions = model.predict(test_features)
    return mean_squared_error(y_test, predictions)

In [10]:
# Fit a random forest on word-trigram TF-IDF features and print the held-out MSE.
# The forest is seeded so the score is repeatable across runs; values printed
# by earlier unseeded runs will differ.
model = RandomForestRegressor(random_state=42)
tfidf = TfidfVectorizer(strip_accents='ascii', ngram_range=(3, 3))
model_score = score_final_model(model, tfidf, X_train, y_train, X_test, y_test)
print(model_score)

0.2887655833333339


In [11]:
# Sanity check on the split: total length of all documents (len() of each
# element of X) and number of targets, for the test and train sets.
count_xt = sum(len(doc) for doc in X_test)
count_yt = len(y_test)

count_xtr = sum(len(doc) for doc in X_train)
count_ytr = len(y_train)

print('with stopwords')
print('X-test = ', count_xt)
print('y-test = ', count_yt)
print('X-train = ', count_xtr)
print('y-train = ', count_ytr)



with stopwords
X-test =  582044
y-test =  36
X-train =  2232923
y-train =  141
