# Exploring the Relationship between TV Comedy Scripts and Ratings
## Katherine Schinkel and Marcus Rosti

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.cross_validation import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_squared_error
from stop_words import get_stop_words

## Data Preparation

In [2]:
friends = pd.read_csv("https://raw.githubusercontent.com/kms6bn/TextMiningSeinfeld/develop/data/friendsData.csv?token=AMpR5IEeFWdSvOulNv2m2YdUOEIxHlkmks5XNAIRwA%3D%3D")
print(friends.dtypes)
friends_text = pd.read_csv("https://raw.githubusercontent.com/kms6bn/TextMiningSeinfeld/develop/data/friendsScripts.csv?token=AMpR5IxA6I4Mwn1wFsAXhThCMpUKz12pks5XNAFIwA%3D%3D")
print(friends_text.dtypes)

Season                    int64
Title                    object
imdbID                   object
Episode                   int64
Released                 object
imdbRating              float64
No.overall                int64
Directed by              object
Written by               object
Original air date        object
Prod.code                object
U.S.viewers_millions    float64
ViewersNext             float64
dtype: object
Season      int64
Episode     int64
Text       object
dtype: object


In [3]:
seinfeld_text = pd.read_csv("https://raw.githubusercontent.com/kms6bn/TextMiningSeinfeld/develop/data/seinfeldScripts.csv?token=AMpR5Ptod3tWU_kh4wfFk8kdMfE__L1Aks5XNAD4wA%3D%3D", header=None)
seinfeld_text.columns = ["Season", "Episode", "Text"]
print(seinfeld_text.dtypes)

Season      int64
Episode     int64
Text       object
dtype: object


In [4]:
seinfeld = pd.read_csv("https://raw.githubusercontent.com/kms6bn/TextMiningSeinfeld/develop/data/seinfeldData.csv?token=AMpR5PU_HlSsWQQZZNVr_ozaTGcMMqMpks5XNADbwA%3D%3D")
print(seinfeld.dtypes)

Season                    int64
Released                 object
Episode                   int64
imdbRating              float64
Title                    object
No.overall               object
No.inSeason               int64
Directed                 object
Written                  object
Prod.Code                object
U.S.viewers_millions    float64
ViewersNext             float64
dtype: object


In [5]:
seinfeldAll = pd.merge(seinfeld, seinfeld_text, on=('Season', 'Episode'))
friendsAll = pd.merge(friends, friends_text, on=('Season', 'Episode'))

In [6]:
#select target value
target = "ViewersNext"
#target = "imdbRating"

In [7]:
#replace missing values
if target == "ViewersNext":
    meanSeinfeld = sum(seinfeldAll['U.S.viewers_millions'])/len(seinfeldAll)
    seinfeldAll['U.S.viewers_millions'] = seinfeldAll['U.S.viewers_millions'].replace(-1, meanSeinfeld)
    seinfeldAll['ViewersNext'] = seinfeldAll['ViewersNext'].replace(-1, meanSeinfeld)

In [8]:
#drop rows where ViewersNext is missing
if target == "ViewersNext":
    seinfeldAll = seinfeldAll.dropna()
    friendsAll = friendsAll.dropna()

In [9]:
cols = ["Text",target]
df1 = seinfeldAll[cols]
df1.columns = ['Text', 'Target']

df2 = friendsAll[cols]
df2.columns = ['Text', 'Target']

df = df1.append(df2, ignore_index=True)
df = df.reindex(np.random.permutation(df.index))

In [10]:
mean = sum(df['Target'])/len(df)

## Model Preparation

In [11]:
#get stop words
en_stop = get_stop_words('en')

In [12]:
from sklearn.linear_model import LinearRegression
from sklearn import base

class DenseTransformer(base.TransformerMixin):

    def transform(self, X, y=None, **fit_params):
        return X.todense()

    def fit_transform(self, X, y=None, **fit_params):
        self.fit(X, y, **fit_params)
        return self.transform(X)

    def fit(self, X, y=None, **fit_params):
        return self


In [13]:
#Try Stemming
from nltk.stem.snowball import SnowballStemmer
import re

stemmer = SnowballStemmer("english")
analyzer = CountVectorizer(stop_words = en_stop, min_df=20).build_analyzer()

def stemmed_words(doc):
    return (stemmer.stem(re.sub(r"\b\d+\b", "", w.strip())) for w in analyzer(doc))

stem_vectorizer = CountVectorizer(analyzer=stemmed_words)

In [14]:
#Instead of Stemming, try lemmatizing and tokenizing
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer

class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(t.strip()) for t in word_tokenize(doc)]

In [15]:
# create training/testing
doc_train, doc_test = train_test_split(df, test_size=0.3)

In [16]:
rand = np.random.randn(len(doc_test))
meanDF = pd.DataFrame({"mean":mean,"rand":rand})["mean"]

## Linear Regression Model

In [17]:
text_clf = Pipeline([
                    #('vect', CountVectorizer(min_df=10, tokenizer=LemmaTokenizer())), #trigrams
                    ('vect', CountVectorizer(analyzer=stemmed_words)), #stemming
                     ('tfidf', TfidfTransformer()),
                     ('to_dense', DenseTransformer()),
                     ('clf', LinearRegression())
])
text_clf.fit(doc_train["Text"], doc_train['Target'])
predicted_value = text_clf.predict(doc_test["Text"])
print(mean_squared_error(doc_test['Target'], predicted_value))
print(mean_squared_error(doc_test['Target'], meanDF))

37.0637471782
39.0206093937


In [18]:
#t-test
(mean_squared_error(doc_test['Target'], predicted_value) - mean_squared_error(doc_test['Target'], meanDF)) / (np.std(predicted_value)/np.sqrt(len(doc_test)))

-6.5960455214009981

In [19]:
#look at results
pd.concat([pd.DataFrame(doc_test['Target'].values), pd.DataFrame(predicted_value)], ignore_index=True, axis=1)

Unnamed: 0,0,1
0,35.900000,30.011930
1,16.300000,27.226133
2,17.900000,24.994204
3,30.200000,31.889172
4,33.200000,31.452530
5,23.000000,28.568966
6,15.200000,25.635617
7,25.200000,26.233482
8,25.460000,23.555957
9,24.800000,28.412720


In [20]:
# run cross validation
scores = cross_val_score(text_clf, df["Text"], df["Target"], cv=10, scoring="mean_squared_error", n_jobs=-1)
print(scores)
print("Mean Squared Error: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 1.645))

[-54.27899509 -42.33736646 -22.63727444 -29.83330997 -39.99210153
 -81.63736173 -28.64244861 -34.81083731 -13.23370343 -27.9901362 ]
Mean Squared Error: -37.54 (+/- 29.90)


In [21]:
# Target Shuffling
for i in range(10):
    dfcopy = pd.concat((df["Text"],df["Target"].reindex(np.random.permutation(df.index))), axis=1)
    
    # create training/testing
    doc_train_copy, doc_test_copy = train_test_split(dfcopy, test_size=0.3)
    
    text_clf_copy = Pipeline([
                    #('vect', CountVectorizer(min_df=10, tokenizer=LemmaTokenizer())), #trigrams
                    ('vect', CountVectorizer(analyzer=stemmed_words)), #stemming
                     ('tfidf', TfidfTransformer()),
                     ('to_dense', DenseTransformer()),
                     ('clf', LinearRegression())
    ])
    text_clf_copy.fit(doc_train_copy["Text"], doc_train_copy['Target'])
    
    scores = cross_val_score(text_clf_copy, dfcopy["Text"], dfcopy["Target"], cv=10, scoring="mean_squared_error",n_jobs=-1)
    print(scores)
    print("Mean Squared Error: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 1.645))

[-136.1918904   -83.5174202   -36.30591511  -42.44081304 -119.54122969
  -54.61863795   -8.55587999  -17.91149187  -25.23821475  -58.98490508]
Mean Squared Error: -58.33 (+/- 66.62)


KeyboardInterrupt: 

In [145]:
print(text_clf.named_steps['vect'].get_feature_names())

[u'', u'10th', u'115th', u'14th', u'15show', u'18a', u'18th', u'19th', u'1st', u'20s', u'20th', u'21st', u'239o', u'23rd', u'242s', u'26th', u'28th', u'29th', u'2g', u'3rd', u'40th', u'42nd', u'44th', u'46th', u'48th', u'4d', u'4i', u'4th', u'50s', u'50th', u'53rd', u'55th', u'5b', u'5th', u'75th', u'77th', u'78th', u'7th', u'81st', u'83rd', u'84th', u'85th', u'86th', u'88th', u'8th', u'9th', u'aa', u'aaa', u'aaaaaaahhh', u'aaaaaah', u'aaaaaahhh', u'aaaaah', u'aaaahh', u'aaaand', u'aaach', u'aaagh', u'aaah', u'aaahh', u'aaahhhgggg', u'aaahhhhh', u'aaalright', u'aaarrrrrgh', u'aaawh', u'aagh', u'aaghh', u'aah', u'aahh', u'aahhhh', u'aahhrgh', u'aalll', u'aawwh', u'aay', u'aaya', u'aback', u'abandon', u'abato', u'abbi', u'abbott', u'abduct', u'abet', u'abid', u'abierta', u'abil', u'abl', u'abnorm', u'aboard', u'abod', u'abolish', u'abort', u'aboud', u'about', u'abraham', u'abrupt', u'abscam', u'absent', u'absolut', u'absorb', u'absorpt', u'absurd', u'abus', u'abut', u'ac', u'academ', u'a

In [90]:
#try topic modeling
from sklearn.decomposition import LatentDirichletAllocation

text_clf = Pipeline([
                    #('vect', CountVectorizer(min_df=5, tokenizer=LemmaTokenizer())), #trigrams
                    ('vect', CountVectorizer(analyzer=stemmed_words)), #stemming
                    ('tfidf', TfidfTransformer()),
                    ('to_dense', DenseTransformer()),
                    ('lda', LatentDirichletAllocation(n_topics=50)),
                    ('clf', LinearRegression())
])
text_clf.fit(doc_train["Text"], doc_train['Target'])
predicted_value = text_clf.predict(doc_test["Text"])
print(mean_squared_error(doc_test['Target'], predicted_value))
print(mean_squared_error(doc_test['Target'], meanDF))

31.9349080012
974.289994558


In [91]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()

tf_feature_names = text_clf.named_steps['vect'].get_feature_names()
print_top_words(text_clf.named_steps['lda'], tf_feature_names, 20)

Topic #0:
chain eyelid feisti seashor function deerhunt havent guatamala quirk vowel well homeless matey dentit gatorad ceram laundri bonus veggi terror
Topic #1:
well im know jerri don hampton clickclocken yeeaaah schumann just coy yai law realli now hungov coul exact uneth offici
Topic #2:
gimpi urkel magnet neutral burst eighth mitchel cooki joyrid memphi goodbodi chandler strut enter paul corinn pail disori ssssfkittl shmootsi
Topic #3:
deluis comer wisdom susi reschedul reduct schto howev weavin varoom spastic immin pam goddby whend lorrain distil averi cure stairway
Topic #4:
know oh re can just yeah get go jerri rachel uh well ross don shirt right think hey ok alright
Topic #5:
creak tension lemma hai sizeabl sellfish havana primetim earth charl speach offend cretac tallest remain twit semest tailor therapi mortifi
Topic #6:
west breakup pooll agit liberac toil dormitori salman ladybug furri onstag upstand jeweleri jockstrap entemann generoso metro grabbin silk leningrad
Topic #

In [131]:
import numpy as np
import lda

vectorizer = CountVectorizer(analyzer=stemmed_words, stop_words='english', ngram_range=(1,3))
docs = vectorizer.fit_transform(seinfeldAll.Text)
lda_model = lda.LDA(n_topics=40, n_iter=1500, random_state=1)
lda_model = lda_model.fit(docs)

In [132]:
topic_word = lda_model.topic_word_ 
vocab = vectorizer.get_feature_names()
n_top_words = 15
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-n_top_words:-1]
    print('Topic {}: {}'.format(i, ' '.join(topic_words)))

Topic 0: brien lloyd paul jean murphi man wake hot rye game dial alarm gum bastard
Topic 1: leo hello jerri uncl alright check card nana kiss open ask finger write doctor
Topic 2: oh yeah well hey know just re get look got can see think one
Topic 3: funni alright mickey doll david babu shoe beth gail gay man pasta tractor better
Topic 4: show re idea russel noth nbc someth tv kramer charact come butler salsa right
Topic 5: uh alright jerri okay ah em wanna smile hi point indic listen laugh shout
Topic 6: jerri georg elain kramer now want back go come thing start take tri time
Topic 7: morti move poni florida jack cadillac leo marisa coffe neil parent son condo cuban
Topic 8: car park drive van move alright space pull god hit front citi key run
Topic 9: drake toni big salad face handicap alright wheelchair huh ok butter rock newman jane
Topic 10: bubbl test ha laugh hu boy ah sir moop bra babu donald steinbrenn iq
Topic 11: hand door look uh open get take point turn walk coat box sit mo