In [1]:
import pandas as pd
import numpy as np
import unicodedata
from nltk.tokenize import sent_tokenize
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
import string
from nltk.corpus import stopwords
nltk.download('stopwords')
import string
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
import os               # for environ variables in Part 3

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

from sklearn import linear_model

[nltk_data] Downloading package punkt to /Users/Kenkre/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Kenkre/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
df = pd.read_csv('data/final_office.csv')

### Practicing NLP for Season 1, Episode 1

In [3]:
text1=df[(df['season'] == 1) & (df['episode'] == 1)]['text']

In [4]:
paragraph = text1.str.cat(sep=' ')

In [5]:
paragraph

'All right Jim. Your quarterlies look very good. How are things at the library? Oh, I told you. I couldn\'t close it. So... So you\'ve come to the master for guidance? Is this what you\'re saying, grasshopper? Actually, you called me in here, but yeah. All right. Well, let me show you how it\'s done. Yes, I\'d like to speak to your office manager, please. Yes, hello. This is Michael Scott. I am the Regional Manager of Dunder Mifflin Paper Products. Just wanted to talk to you manager-a-manger. All right. Done deal. Thank you very much, sir. You\'re a gentleman and a scholar. Oh, I\'m sorry. OK. I\'m sorry. My mistake. That was a woman I was talking to, so... She had a very low voice. Probably a smoker, so... So that\'s the way it\'s done. I\'ve, uh, I\'ve been at Dunder Mifflin for 12 years, the last four as Regional Manager. If you want to come through here... See we have the entire floor. So this is my kingdom, as far as the eye can see. This is our receptionist, Pam. Pam! Pam-Pam! Pa

#### Normalizing Text

In [6]:
import unicodedata

def remove_accents(input_str):
    nfkd_form = unicodedata.normalize('NFKD', input_str)
    only_ascii = nfkd_form.encode('ASCII', 'ignore')
    return only_ascii.decode()

input_string = remove_accents(paragraph)

#### Tokenize Text

In [7]:
from nltk.tokenize import sent_tokenize
import nltk
nltk.download('punkt')

sent_tokens = sent_tokenize(input_string)

sent_tokens

[nltk_data] Downloading package punkt to /Users/Kenkre/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


['All right Jim.',
 'Your quarterlies look very good.',
 'How are things at the library?',
 'Oh, I told you.',
 "I couldn't close it.",
 'So...',
 "So you've come to the master for guidance?",
 "Is this what you're saying, grasshopper?",
 'Actually, you called me in here, but yeah.',
 'All right.',
 "Well, let me show you how it's done.",
 "Yes, I'd like to speak to your office manager, please.",
 'Yes, hello.',
 'This is Michael Scott.',
 'I am the Regional Manager of Dunder Mifflin Paper Products.',
 'Just wanted to talk to you manager-a-manger.',
 'All right.',
 'Done deal.',
 'Thank you very much, sir.',
 "You're a gentleman and a scholar.",
 "Oh, I'm sorry.",
 'OK.',
 "I'm sorry.",
 'My mistake.',
 'That was a woman I was talking to, so... She had a very low voice.',
 'Probably a smoker, so...',
 "So that's the way it's done.",
 "I've, uh, I've been at Dunder Mifflin for 12 years, the last four as Regional Manager.",
 'If you want to come through here... See we have the entire flo

In [8]:
from nltk.tokenize import word_tokenize

tokens = [sent for sent in map(word_tokenize, sent_tokens)]

list(enumerate(tokens))

[(0, ['All', 'right', 'Jim', '.']),
 (1, ['Your', 'quarterlies', 'look', 'very', 'good', '.']),
 (2, ['How', 'are', 'things', 'at', 'the', 'library', '?']),
 (3, ['Oh', ',', 'I', 'told', 'you', '.']),
 (4, ['I', 'could', "n't", 'close', 'it', '.']),
 (5, ['So', '...']),
 (6,
  ['So', 'you', "'ve", 'come', 'to', 'the', 'master', 'for', 'guidance', '?']),
 (7, ['Is', 'this', 'what', 'you', "'re", 'saying', ',', 'grasshopper', '?']),
 (8,
  ['Actually',
   ',',
   'you',
   'called',
   'me',
   'in',
   'here',
   ',',
   'but',
   'yeah',
   '.']),
 (9, ['All', 'right', '.']),
 (10,
  ['Well', ',', 'let', 'me', 'show', 'you', 'how', 'it', "'s", 'done', '.']),
 (11,
  ['Yes',
   ',',
   'I',
   "'d",
   'like',
   'to',
   'speak',
   'to',
   'your',
   'office',
   'manager',
   ',',
   'please',
   '.']),
 (12, ['Yes', ',', 'hello', '.']),
 (13, ['This', 'is', 'Michael', 'Scott', '.']),
 (14,
  ['I',
   'am',
   'the',
   'Regional',
   'Manager',
   'of',
   'Dunder',
   'Mifflin',
 

#### Convert text to lower case

In [9]:
import string

tokens_lower = [[word.lower() for word in sent]
                 for sent in tokens]

#### Filtering stopwords (and punctuation)

In [10]:
from nltk.corpus import stopwords

nltk.download('stopwords')

stopwords_ = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Kenkre/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
import string

punctuation_ = set(string.punctuation)

def filter_tokens(sent):
    return([w for w in sent if not w in stopwords_ and not w in punctuation_])

tokens_filtered = list(map(filter_tokens, tokens_lower))

#### Stemming and lemmatization

In [12]:
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer

stemmer_porter = PorterStemmer()
tokens_stemporter = [list(map(stemmer_porter.stem, sent)) for sent in tokens_filtered]

stemmer_snowball = SnowballStemmer('english')
tokens_stemsnowball = [list(map(stemmer_snowball.stem, sent)) for sent in tokens_filtered]

### TF-IDF - for my own understanding, creating TFIDF manually, but final will be done using sklearn

In [13]:
df[(df['season'] == 9)]['episode'].unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22])

In [14]:
# 1 - 1, 2, 3, 4, 5, 6
# 2 - 1, 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22
# 3 - 1, 2,  3,  4,  5,  6,  7,  8,  9, 10, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23
# 4 - 1, 3,  5,  7,  9, 10, 11, 12, 13, 14
# 5 - 1, 3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26
# 6 - 1, 2,  3,  4,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 19, 20, 21, 22, 23, 24, 25, 26
# 7 - 1, 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24
# 8 - 1, 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24
# 9 - 1, 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22

In [15]:
def remove_accents(input_str):
    nfkd_form = unicodedata.normalize('NFKD', input_str)
    only_ascii = nfkd_form.encode('ASCII', 'ignore')
    return only_ascii.decode()

# def filter_tokens(sent):
#     stopwords_ = set(stopwords.words('english'))
#     punctuation_ = set(string.punctuation)
#     return(w for w in sent if not w in stopwords_ and not w in punctuation_)

def load_data():

    shows = {1: [1, 2, 3, 4, 5, 6],
            2: [1, 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22],
            3: [1, 2,  3,  4,  5,  6,  7,  8,  9, 10, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23],
            4: [1, 3,  5,  7,  9, 10, 11, 12, 13, 14],
            5: [1, 3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26],
            6: [1, 2,  3,  4,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 19, 20, 21, 22, 23, 24, 25, 26],
            7: [1, 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24],
            8: [1, 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24],
            9: [1, 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]}
    
    docs=[]
    ratings=[]
    for k, lst in shows.items():
        for i in lst:
            text1 = df[(df['season'] == k) & (df['episode'] == i)]['text']
            rating1 = df[(df['season'] == k) & (df['episode'] == i)]['averageRating'].mean()
           
            # normalize text
            input_string = remove_accents(paragraph)
            # tokenize text
            
            tokens = word_tokenize(input_string)

            # convert text to lower case
            tokens_lower = [word.lower() for word in tokens]
            # filtering stopwords and punctuations
            stopwords_ = set(stopwords.words('english'))
            punctuation_ = set(string.punctuation)
            tokens_filtered = [word for word in tokens_lower if word not in stopwords_ and word not in punctuation_]
            
            snowball = SnowballStemmer('english')
            docs_snowball = [snowball.stem(word) for word in tokens_filtered]
            docs.append(docs_snowball)
            ratings.append(rating1)
            ratings = [round(num, 1) for num in ratings]

            
    return docs,ratings
            
        

In [16]:
X,y = load_data()

In [17]:
len(X)

177

In [20]:
vocab_set = set()
[[vocab_set.add(token) for token in tokens] for tokens in X]
vocab = list(vocab_set)

In [21]:
vocab_dict = {word: i for i, word in enumerate(vocab)}

In [24]:
#word_counts = np.zeros((len(tokens_filtered), len(vocab)))
word_counts = np.zeros((len(X), len(vocab)))

for doc_id, words in enumerate(X):
    
    for word in words:
        
        word_id = vocab_dict[word]
        word_counts[doc_id][word_id] += 1

In [25]:
df = np.sum(word_counts > 0)

In [26]:
df

103368

In [28]:
tf_norm = word_counts.sum(axis=1)
tf_norm[tf_norm == 0] = 1
tf = word_counts / tf_norm.reshape(len(X), 1)

In [30]:
idf = np.log((len(X) + 1.) / (1. + df)) + 1.
tfidf = tf * idf

In [32]:
tfidf_norm = np.sqrt((tfidf ** 2).sum(axis=1))
tfidf_norm[tfidf_norm == 0] = 1
tfidf_normed = tfidf / tfidf_norm.reshape(len(X), 1)

In [33]:
tfidf_normed.shape

(177, 584)

In [34]:
tfidf_normed

array([[-0.00809431, -0.01618863, -0.00809431, ..., -0.00809431,
        -0.02428294, -0.14569765],
       [-0.00809431, -0.01618863, -0.00809431, ..., -0.00809431,
        -0.02428294, -0.14569765],
       [-0.00809431, -0.01618863, -0.00809431, ..., -0.00809431,
        -0.02428294, -0.14569765],
       ...,
       [-0.00809431, -0.01618863, -0.00809431, ..., -0.00809431,
        -0.02428294, -0.14569765],
       [-0.00809431, -0.01618863, -0.00809431, ..., -0.00809431,
        -0.02428294, -0.14569765],
       [-0.00809431, -0.01618863, -0.00809431, ..., -0.00809431,
        -0.02428294, -0.14569765]])

In [35]:
X_train, X_test, y_train, y_test = train_test_split(tfidf_normed, y, test_size=0.2, random_state=42)

In [36]:
model = RandomForestRegressor()
model.fit(X_train, y_train)
y_predict = model.predict(X_test)

In [37]:
mean_squared_error(y_test, y_predict)

0.2977488062639371

In [38]:
clf = linear_model.Lasso(alpha=0.1)
clf.fit(X_train, y_train)
y_predict =clf.predict(X_test)
mean_squared_error(y_test, y_predict)

0.2980993997619168

In [39]:
count_xt = 0
for lst in X_test:
    count_xt += len(lst)
count_yt = len(y_test)


count_xtr = 0
for lst in X_train:
    count_xtr += len(lst)
count_ytr = len(y_train)
    
print ('without stopwords')
print ( 'X-test = ',count_xt )
print ( 'y-test = ',count_yt )
print ( 'X-train = ',count_xtr )
print ( 'y-train = ',count_ytr )

without stopwords
X-test =  21024
y-test =  36
X-train =  82344
y-train =  141
