# Exploring the Relationship between TV Comedy Scripts and Ratings
## Katherine Schinkel and Marcus Rosti

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.cross_validation import cross_val_score, KFold
from sklearn.pipeline import Pipeline
from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_squared_error
from stop_words import get_stop_words

## Data Preparation

In [3]:
friends = pd.read_csv("../data/friendsData.csv")
print(friends.dtypes)
friends_text = pd.read_csv("../data/friendsScripts.csv")
print(friends_text.dtypes)

Season                    int64
Title                    object
imdbID                   object
Episode                   int64
Released                 object
imdbRating              float64
No.overall                int64
Directed by              object
Written by               object
Original air date        object
Prod.code                object
U.S.viewers_millions    float64
ViewersNext             float64
dtype: object
Season      int64
Episode     int64
Text       object
dtype: object


In [4]:
seinfeld_text = pd.read_csv("../data/seinfeldScripts.csv", header=None)
seinfeld_text.columns = ["Season", "Episode", "Text"]
print(seinfeld_text.dtypes)

Season      int64
Episode     int64
Text       object
dtype: object


In [5]:
seinfeld = pd.read_csv("../data/seinfeldData.csv")
print(seinfeld.dtypes)

Season                    int64
Released                 object
Episode                   int64
imdbRating              float64
Title                    object
No.overall               object
No.inSeason               int64
Directed                 object
Written                  object
Prod.Code                object
U.S.viewers_millions    float64
ViewersNext             float64
dtype: object


In [6]:
seinfeldAll = pd.merge(seinfeld, seinfeld_text, on=('Season', 'Episode'))
friendsAll = pd.merge(friends, friends_text, on=('Season', 'Episode'))

In [7]:
#select target value
#target = "ViewersNext"
target = "imdbRating"

In [8]:
#replace missing values
if target == "ViewersNext":
    meanSeinfeld = sum(seinfeldAll['U.S.viewers_millions'])/len(seinfeldAll)
    seinfeldAll['U.S.viewers_millions'] = seinfeldAll['U.S.viewers_millions'].replace(-1, meanSeinfeld)
    seinfeldAll['ViewersNext'] = seinfeldAll['ViewersNext'].replace(-1, meanSeinfeld)

In [9]:
#drop rows where ViewersNext is missing
if target == "ViewersNext":
    seinfeldAll = seinfeldAll.dropna()
    friendsAll = friendsAll.dropna()

In [10]:
cols = ["Text",target]
df1 = seinfeldAll[cols]
df1.columns = ['Text', 'Target']

df2 = friendsAll[cols]
df2.columns = ['Text', 'Target']

df = df1.append(df2, ignore_index=True)
df = df.sample(frac=1).reset_index(drop=True)
df.columns = ['Text', 'Target']

## Model Preparation

In [11]:
#get stop words
en_stop = get_stop_words('en')

In [12]:
from sklearn.linear_model import LinearRegression
from sklearn import base

class DenseTransformer(base.TransformerMixin):

    def transform(self, X, y=None, **fit_params):
        return X.todense()

    def fit_transform(self, X, y=None, **fit_params):
        self.fit(X, y, **fit_params)
        return self.transform(X)

    def fit(self, X, y=None, **fit_params):
        return self


In [13]:
#Try Stemming
from nltk.stem.snowball import SnowballStemmer
import re

stemmer = SnowballStemmer("english")
analyzer = CountVectorizer(stop_words = en_stop).build_analyzer()

def stemmed_words(doc):
    return (stemmer.stem(re.sub(r"\b\d+\b", "", w.strip())) for w in analyzer(doc))

stem_vectorizer = CountVectorizer(analyzer=stemmed_words)

In [14]:
#Instead of Stemming, try lemmatizing and tokenizing
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer

class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(t.strip()) for t in word_tokenize(doc)]

In [15]:
# create training/testing
doc_train, doc_test = train_test_split(df, test_size=0.3)

In [16]:
mean = sum(doc_train['Target'])/len(doc_train)

In [17]:
rand = np.random.randn(len(doc_test))
meanDF = pd.DataFrame({"mean":mean,"rand":rand})["mean"]

## TFIDF Regression Model

In [18]:
text_clf = Pipeline([
                    #('vect', CountVectorizer(min_df=10, tokenizer=LemmaTokenizer())), #trigrams
                    ('vect', CountVectorizer(analyzer=stemmed_words, min_df=10)), #stemming
                     ('tfidf', TfidfTransformer()),
                     ('to_dense', DenseTransformer()),
                     ('clf', LinearRegression())
])
text_clf.fit(doc_train["Text"], doc_train['Target'])
predicted_value = text_clf.predict(doc_test["Text"])
print(mean_squared_error(doc_test['Target'], predicted_value))
print(mean_squared_error(doc_test['Target'], meanDF))

0.137210818588
0.138231595107


In [19]:
print(len(text_clf.named_steps['clf'].coef_))

2509


In [20]:
#t-test
(mean_squared_error(doc_test['Target'], predicted_value) - mean_squared_error(doc_test['Target'], meanDF)) / (np.std(predicted_value)/np.sqrt(len(predicted_value)))

-0.057927285023036117

In [21]:
#look at results
pd.concat([pd.DataFrame(doc_test['Target'].values), pd.DataFrame(predicted_value)], ignore_index=True, axis=1)

Unnamed: 0,0,1
0,8.5,8.588365
1,8.4,8.449487
2,8.5,8.802441
3,8.3,8.945472
4,8.1,8.707590
5,8.3,8.629688
6,9.0,8.570380
7,8.7,8.501490
8,8.8,8.466440
9,8.5,8.489548


In [23]:
modelmse = []
meanmse = []
kf = KFold(n=len(df), n_folds=10, shuffle=True,random_state=None)
for train_index, test_index in kf:
    X_train, X_test = df.Text[train_index], df.Text[test_index]
    y_train, y_test = df.Target[train_index], df.Target[test_index]
    text_clf = Pipeline([
            #('vect', CountVectorizer(min_df=10, tokenizer=LemmaTokenizer())), #trigrams
            ('vect', CountVectorizer(analyzer=stemmed_words,min_df=20)), #stemming
            ('tfidf', TfidfTransformer()),
            ('to_dense', DenseTransformer()),
            ('clf', LinearRegression())
    ])
    text_clf.fit(X_train, y_train)
    predicted_value = text_clf.predict(X_test)
    mean_kf = sum(y_train)/len(y_train)
    rand_kf = np.random.randn(len(y_test))
    meanDF_kf = pd.DataFrame({"mean":mean_kf,"rand":rand_kf})["mean"]
    meanmse.append(mean_squared_error(y_test, meanDF_kf))
    modelmse.append(mean_squared_error(y_test, predicted_value))
    #print("Model mse: ",mean_squared_error(y_test, predicted_value))
    #print("Mean mse : ",mean_squared_error(y_test, meanDF_kf))
    #print("\n")


In [24]:
print(np.mean(meanmse))
print(np.mean(modelmse))
print((np.mean(meanmse)-np.mean(modelmse))/np.sqrt((np.std(meanmse)**2)/10+(np.std(modelmse)**2)/10))

0.132609319805
0.173900420709
-2.04982337819


# Topic Modeling Linear Regression

In [26]:
#try topic modeling
from sklearn.decomposition import LatentDirichletAllocation

text_clf = Pipeline([
                    #('vect', CountVectorizer(min_df=5, tokenizer=LemmaTokenizer())), #trigrams
                    ('vect', CountVectorizer(analyzer=stemmed_words, min_df=10)), #stemming
                    ('to_dense', DenseTransformer()),
                    ('lda', LatentDirichletAllocation(n_topics=40)),
                    ('clf', LinearRegression())
])
text_clf.fit(doc_train["Text"], doc_train['Target'])
predicted_value = text_clf.predict(doc_test["Text"])
print(mean_squared_error(doc_test['Target'], predicted_value))
print(mean_squared_error(doc_test['Target'], meanDF))

0.138534269016
0.138231595107


In [93]:
# run cross validation
scores = cross_val_score(text_clf, df["Text"], df["Target"], cv=10, scoring="mean_squared_error", n_jobs=-1)
print(scores)
print("Mean Squared Error: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 1.645))

[ -26.96848238  -43.67698889  -39.76237612  -54.42685664  -25.23771934
  -46.00232837 -110.25023773  -44.10487377  -30.80258886  -40.77366654]
Mean Squared Error: -46.20 (+/- 37.86)


In [150]:
modelmse = []
meanmse = []
from sklearn.cross_validation import KFold
kf = KFold(n=len(df), n_folds=10, shuffle=False,random_state=None)
for train_index, test_index in kf:
    X_train, X_test = df.Text[train_index], df.Text[test_index]
    y_train, y_test = df.Target[train_index], df.Target[test_index]
    text_clf = Pipeline([
                    #('vect', CountVectorizer(min_df=5, tokenizer=LemmaTokenizer())), #trigrams
                    ('vect', CountVectorizer(analyzer=stemmed_words, min_df=10)), #stemming
                    ('to_dense', DenseTransformer()),
                    ('lda', LatentDirichletAllocation(n_topics=40)),
                    ('clf', LinearRegression())
    ])
    text_clf.fit(X_train, y_train)
    predicted_value = text_clf.predict(X_test)
    mean_kf = sum(y_train)/len(y_train)
    rand_kf = np.random.randn(len(y_test))
    meanDF_kf = pd.DataFrame({"mean":mean_kf,"rand":rand_kf})["mean"]
    meanmse.append(mean_squared_error(y_test, meanDF_kf))
    modelmse.append(mean_squared_error(y_test, predicted_value))
    #print("Model mse: ",mean_squared_error(y_test, predicted_value))
    #print("Mean mse : ",mean_squared_error(y_test, meanDF_kf))
    #print("\n")

In [151]:
print(np.mean(meanmse))
print(np.mean(modelmse))
print((np.mean(meanmse)-np.mean(modelmse))/np.sqrt((np.std(meanmse)**2)/10+(np.std(modelmse)**2)/10))

0.132985964972
0.132297672078
0.0622007153507


In [132]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()

tf_feature_names = text_clf.named_steps['vect'].get_feature_names()
print_top_words(text_clf.named_steps['lda'], tf_feature_names, 20)

Topic #0:
oh just know yeah well like joey uh monica get hey see look go gonna dont want re right im
Topic #1:
yeah re just go like hey know look oh don can think get uh well want realli ll jerri gonna
Topic #2:
label oh super newman bowl ticket yeah well know go just get gift like hey can lane upstair look don
Topic #3:
pig bone maid spare know oh babi just look like don well hey yeah bra squar get sniff suck re
Topic #4:
take kept split upstair beyond particular bow silenc silent solv frantic bean subject spare interview romant estell just wh red
Topic #5:
oh well yeah know don go re hey get right see look just jerri like think can gonna georg uh
Topic #6:
oh know hey go yeah re like well look get can don joey just right one come uh got jerri
Topic #7:
right well know hey just ross like rachel oh can re look tell get think go guy realli yeah monica
Topic #8:
know just like re yeah okay oh go hey think can get don joey look uh one right want back
Topic #9:
know oh yeah go well just re

## LDA example 

In [131]:
import numpy as np
import lda

vectorizer = CountVectorizer(analyzer=stemmed_words, stop_words='english', ngram_range=(1,3))
docs = vectorizer.fit_transform(seinfeldAll.Text)
lda_model = lda.LDA(n_topics=40, n_iter=1500, random_state=1)
lda_model = lda_model.fit(docs)

In [132]:
topic_word = lda_model.topic_word_ 
vocab = vectorizer.get_feature_names()
n_top_words = 15
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-n_top_words:-1]
    print('Topic {}: {}'.format(i, ' '.join(topic_words)))

Topic 0: brien lloyd paul jean murphi man wake hot rye game dial alarm gum bastard
Topic 1: leo hello jerri uncl alright check card nana kiss open ask finger write doctor
Topic 2: oh yeah well hey know just re get look got can see think one
Topic 3: funni alright mickey doll david babu shoe beth gail gay man pasta tractor better
Topic 4: show re idea russel noth nbc someth tv kramer charact come butler salsa right
Topic 5: uh alright jerri okay ah em wanna smile hi point indic listen laugh shout
Topic 6: jerri georg elain kramer now want back go come thing start take tri time
Topic 7: morti move poni florida jack cadillac leo marisa coffe neil parent son condo cuban
Topic 8: car park drive van move alright space pull god hit front citi key run
Topic 9: drake toni big salad face handicap alright wheelchair huh ok butter rock newman jane
Topic 10: bubbl test ha laugh hu boy ah sir moop bra babu donald steinbrenn iq
Topic 11: hand door look uh open get take point turn walk coat box sit mo