# Tuning Count Vectorization - One Hot Encoding and other Features

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# Load the movie plots and keep only the rows for American movies
plots_df = (
    pd.read_csv("movie_plots.csv")
    .loc[lambda plots: plots["Origin/Ethnicity"] == "American"]
)
plots_df

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr..."
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov..."
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed..."
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...
...,...,...,...,...,...,...,...,...
831,1929,Our Modern Maidens,American,Jack Conway,"Joan Crawford, Douglas Fairbanks Jr.",drama,https://en.wikipedia.org/wiki/Our_Modern_Maidens,"Heiress Billie Brown (Crawford), is engaged to..."
832,1929,The Pagan,American,W.S. Van Dyke,"Ramon Novarro, Renee Adoree, Donald Crisp",romance,https://en.wikipedia.org/wiki/The_Pagan,Trader Henry Slater (Donald Crisp) stops at a ...
833,1929,Paris,American,Clarence G. Badger,"Irene Bordoni, Jack Buchanan",musical comedy,https://en.wikipedia.org/wiki/Paris_(1929_film),"Irène Bordoni is cast as Vivienne Rolland, a P..."
834,1929,Queen Kelly,American,Erich von Stroheim,"Gloria Swanson, Walter Byron",drama,https://en.wikipedia.org/wiki/Queen_Kelly,Prince Wolfram (Byron) is the betrothed of mad...


In [5]:
# The two vectorizer classes imported at the top of the notebook.
vectorizers = [CountVectorizer, TfidfVectorizer]
# The loop variable is deliberately NOT called `vectorizer`: the other cells use
# that name for a configured instance, and a leaked loop variable would leave a
# class bound to it instead.
for vectorizer_cls in vectorizers:
    print(vectorizer_cls)

<class 'sklearn.feature_extraction.text.CountVectorizer'>
<class 'sklearn.feature_extraction.text.TfidfVectorizer'>


In [2]:
 # traditional CountVectorizer
# Baseline: raw term counts, no stop-word removal, no vocabulary pruning
vectorizer = CountVectorizer()

# Alternative configurations (uncomment exactly one to replace the baseline):

# use English stopwords, and use one-hot encoding
#vectorizer = CountVectorizer(stop_words="english", binary=True)

# use English stopwords, and use one-hot encoding, and the word must appear in at least 5% of the movie plots
#vectorizer = CountVectorizer(stop_words="english", binary=True, min_df=0.05)

# use English stopwords, and use one-hot encoding, and the word must appear in at least two of the movie plots
# and keep only the top 200 terms
#vectorizer = CountVectorizer(stop_words="english", binary=True, min_df=2, max_features=200)

X = vectorizer.fit_transform(plots_df["Plot"])

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# it exists and fall back to the old name on older installations.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# One row per movie plot, one column per vocabulary term
vectorized_df = pd.DataFrame(X.toarray(), columns=feature_names)
print(f"Shape of dataframe is {vectorized_df.shape}")
print(f"Total number of occurences: {vectorized_df.sum().sum()}")
#print(f"Word counts: {vectorized_df.sum()}")
vectorized_df.head()

Shape of dataframe is (836, 14807)
Total number of occurences: 175010


Unnamed: 0,00,000,10,100,1000,11,119,12,13,1373,...,zilah,zinderneuf,zola,zone,zones,zoological,zorro,zulus,álvarez,émile
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# use English stopwords, and use one-hot encoding (binary=True records only
# whether a word is present in a plot, not how many times it appears)
vectorizer = CountVectorizer(stop_words="english", binary=True)

X = vectorizer.fit_transform(plots_df["Plot"])

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# it exists and fall back to the old name on older installations.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# One row per movie plot, one column per vocabulary term
vectorized_df = pd.DataFrame(X.toarray(), columns=feature_names)
print(f"Shape of dataframe is {vectorized_df.shape}")
print(f"Total number of occurences: {vectorized_df.sum().sum()}")
#print(f"Word counts: {vectorized_df.sum()}")
vectorized_df.head()

Shape of dataframe is (836, 14526)
Total number of occurences: 69204


Unnamed: 0,00,000,10,100,1000,11,119,12,13,1373,...,zilah,zinderneuf,zola,zone,zones,zoological,zorro,zulus,álvarez,émile
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# use English stopwords, and use one-hot encoding, and the word must appear in at least 5% of the movie plots
# (a float min_df is a proportion of documents)
vectorizer = CountVectorizer(stop_words="english", binary=True, min_df=0.05)

X = vectorizer.fit_transform(plots_df["Plot"])

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# it exists and fall back to the old name on older installations.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# One row per movie plot, one column per vocabulary term
vectorized_df = pd.DataFrame(X.toarray(), columns=feature_names)
print(f"Shape of dataframe is {vectorized_df.shape}")
print(f"Total number of occurences: {vectorized_df.sum().sum()}")
#print(f"Word counts: {vectorized_df.sum()}")
vectorized_df.head()

Shape of dataframe is (836, 213)
Total number of occurences: 17776


Unnamed: 0,able,accepts,accidentally,agrees,american,appears,arrested,arrive,arrives,asks,...,william,wins,woman,women,work,world,year,years,york,young
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# use English stopwords, keep raw counts (no one-hot encoding), and the word must
# appear in at least 10 of the movie plots (an integer min_df is an absolute
# number of documents); max_features is intentionally left unset here
vectorizer = CountVectorizer(stop_words="english", min_df=10)

X = vectorizer.fit_transform(plots_df["Plot"])

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# it exists and fall back to the old name on older installations.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# One row per movie plot, one column per vocabulary term
vectorized_df = pd.DataFrame(X.toarray(), columns=feature_names)
print(f"Shape of dataframe is {vectorized_df.shape}")
print(f"Total number of occurences: {vectorized_df.sum().sum()}")
#print(f"Word counts: {vectorized_df.sum()}")
vectorized_df.head()

Shape of dataframe is (836, 1485)
Total number of occurences: 52760


Unnamed: 0,000,10,abandoned,abandons,able,aboard,absence,accept,accepts,accident,...,writes,written,wrong,yacht,year,years,york,young,younger,youth
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
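# can create a number token (one token that stands in for all numbers)
# min_df as a float (e.g. 0.05) is a proportion: the term must appear in at least 5% of docs
# min_df as an integer (e.g. 2) is an absolute count: the term must appear in at least that many docs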

In [8]:
# use English stopwords, and use one-hot encoding, and the word must appear in at least two of the movie plots
# and keep only the top 200 terms
vectorizer = CountVectorizer(stop_words="english", binary=True, min_df=2, max_features=200)

X = vectorizer.fit_transform(plots_df["Plot"])

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# it exists and fall back to the old name on older installations.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# One row per movie plot, one column per vocabulary term
vectorized_df = pd.DataFrame(X.toarray(), columns=feature_names)
print(f"Shape of dataframe is {vectorized_df.shape}")
print(f"Total number of occurences: {vectorized_df.sum().sum()}")
#print(f"Word counts: {vectorized_df.sum()}")
vectorized_df.head()

Shape of dataframe is (836, 200)
Total number of occurences: 17224


Unnamed: 0,able,accepts,accidentally,agrees,american,appears,arrested,arrives,asks,attempt,...,wife,william,wins,woman,women,work,world,years,york,young
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# use English stopwords, count bigrams only (ngram_range=(2,2) => every feature is a pair of words),
# the bigram must appear in at least two of the movie plots, and keep only the top 200 bigrams
vectorizer = CountVectorizer(ngram_range=(2,2), stop_words="english", min_df=2, max_features=200)

X = vectorizer.fit_transform(plots_df["Plot"])

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# it exists and fall back to the old name on older installations.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# One row per movie plot, one column per bigram
vectorized_df = pd.DataFrame(X.toarray(), columns=feature_names)
print(f"Shape of dataframe is {vectorized_df.shape}")
print(f"Total number of occurences: {vectorized_df.sum().sum()}")
#print(f"Word counts: {vectorized_df.sum()}")
vectorized_df.head()

Shape of dataframe is (836, 200)
Total number of occurences: 2255


Unnamed: 0,10 000,aboard ship,abraham lincoln,accused murder,agrees marry,alice terry,american civil,asks marry,aunt polly,beautiful young,...,years pass,york city,young boy,young couple,young girl,young lady,young man,young men,young woman,younger brother
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# see ngram_range=(2,2) above - every feature is a pair of adjacent words (a bigram)

# Cosine Similarity Example

### Intro to Algorithmic Marketing:
![alt text](images/cos-sim-textbook1.png "Logo Title Text 1")


## Finding Magnitude of a Vector

In [1]:
import math
import numpy as np
def magnitude(x):
    """Return the Euclidean length of ``x``: the square root of the sum of its squared components."""
    squared_total = 0
    for component in x:
        squared_total += component ** 2
    return math.sqrt(squared_total)


vectorA = [0, 3, 1, 2]

# The hand-written helper and NumPy's built-in norm should print the same value
print(f"First approach: {magnitude(vectorA)}")
print(f"Second approach: {np.linalg.norm(vectorA)}")

First approach: 3.7416573867739413
Second approach: 3.7416573867739413


# Pointwise Mutual Information

It's important to identify a **context window** when analyzing co-occurrence. In the image below, the context window size is 4 (2 tokens to either side of the target word):

![alt text](images/context_window.png "Logo Title Text 1")

For the purposes of the next section, we'll define the **entire document as the context window.**

Pointwise mutual information compares the **joint probability of two events happening together** with the probability we would expect if the two events were independent (the product of their individual probabilities). It is defined as the logarithm of that ratio:

$$
PMI_{A,B} = \log\frac{p(A,B)}{p(A)p(B)}
$$

Remember that when two events are independent, $p(A,B) = p(A)p(B)$, so the ratio is 1 and the PMI is 0. Using PMI instead of a raw co-occurrence count is often preferable because very common words are heavily skewed ("the" and "of" will co-occur frequently in the same document simply because each of them is so common).

```python
import math
def pmi(tokenA, tokenB, documents, word_counts, bigram_counts=None):
    """Return the base-2 pointwise mutual information of ``tokenA`` and ``tokenB``.

    The whole document is treated as the context window, so every probability
    is a count divided by the number of documents.

    Parameters
    ----------
    tokenA, tokenB : str
        The two tokens to compare.
    documents : sized collection
        The corpus; only ``len(documents)`` is used.
    word_counts : mapping
        ``word_counts[token]`` is the count for a single token.
        NOTE(review): for p(A) to be a probability under the document-level
        context window this should presumably be the number of documents that
        contain the token - confirm against how the table is built.
    bigram_counts : mapping, optional
        ``bigram_counts["tokenA tokenB"]`` is the number of documents in which
        both tokens appear together. When omitted, the module-level
        ``bigram_freq`` table is used, as in the original version.

    Raises
    ------
    KeyError
        If a token or the space-joined pair is missing from its table.
    ZeroDivisionError
        If ``documents`` is empty or either single-token count is zero.
    ValueError
        If the pair count is zero (the logarithm of 0 is undefined).
    """
    if bigram_counts is None:
        # Backward compatible: earlier versions read this global implicitly.
        bigram_counts = bigram_freq

    n_docs = float(len(documents))
    prob_A = word_counts[tokenA] / n_docs
    prob_B = word_counts[tokenB] / n_docs
    # The pair is looked up under the key "tokenA tokenB"
    prob_A_B = bigram_counts[" ".join([tokenA, tokenB])] / n_docs
    return math.log(prob_A_B / float(prob_A * prob_B), 2)
```

# Collocation

Many times, in previous homeworks, we've had to manually try to find phrases that belong together. For example, `New York City`.

From [nltk.org](http://www.nltk.org/howto/collocations.html), **collocation** can be defined as

> expressions of multiple words which commonly co-occur together. 

In [None]:
# Goal: merge tokens that belong together into a single token, e.g.
# ['New','York','City'] ---> ['NEW_YORK_CITY']

In [None]:
from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
# Lemmatizer applied to every token when the articles are read in
lemmatizer = WordNetLemmatizer()
from nltk.corpus import stopwords
# English stop words plus punctuation-like tokens to discard.
# NOTE: this rebinds `stopwords` from the NLTK corpus reader to a plain set of strings.
stopwords = set(stopwords.words('english')).union(
    [".",'.', ",",":", "''", "'s", "'", "``", "(", ")", "-"]
)

In [None]:
# Each non-empty line of each article becomes one document: a list of lemmatized tokens
documents = []
articles = [f"bbcsport/football/00{i}.txt" for i in range(1,10)]

for article_path in articles:
    # `with` closes each file as soon as it has been read (the handle was
    # previously left open), and the path is no longer shadowed by the file object
    with open(article_path) as article_file:
        for line in article_file:
            line = line.replace("\n", "") # replace the new line escape character
            if len(line) > 0: # if the line is not empty, process it
                documents.append([lemmatizer.lemmatize(token) for token in word_tokenize(line)])

In [None]:
# Drop every token whose stripped, lower-cased form is in the stop-word set;
# the surviving tokens keep their original casing and order.
new_documents = [
    [word for word in doc if word.strip().lower() not in stopwords]
    for doc in documents
]

In [None]:
measures = BigramAssocMeasures()
# Build the bigram finder from the stop-word-filtered documents
collocation_finder = BigramCollocationFinder.from_documents(new_documents)

# The 15 best bigrams when ranked by raw frequency
collocation_finder.nbest(measures.raw_freq, 15)

In [None]:
# TFIDF (for a term, for a specific document)
# TF = looks within a single document
# IDF = looks at all the documents
# rare words have a very high IDF score
# "the" will have low IDF scores, "McDonald's" will have high IDF scores
# works better for longer documents

# Term Frequency / Inverse Document Frequency


## Term Frequency
![alt text](images/tf-idf1.png "Term Frequency")

## Inverse Document Frequency
![alt text](images/tf-idf2.png "Inverse Document Frequency")

### Example Calculation

![alt text](images/tf-idf4.png "Example")

In [None]:
# so far just unigrams
# but we may want phrases "ngrams" - skinny jeans

## Using Scikit-Learn to Generate TF-IDF

In [10]:
# Load the McDonald's Yelp negative reviews; the file is decoded as Latin-1
df = pd.read_csv("mcdonalds-yelp-negative-reviews.csv", encoding="latin1")

In [11]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

# TF-IDF over 3- and 4-word phrases (ngram_range=(3,4)).
# token_pattern keeps only purely alphabetic tokens of at least three letters.
# max_df=0.4 sets the document-frequency ceiling used when building the vocabulary.
# NOTE(review): stopwords.words() is called without a language, which presumably
# returns the stop words of every language NLTK ships - confirm "english" was not intended.
vectorizer = TfidfVectorizer(ngram_range=(3,4),
                             token_pattern=r'\b[a-zA-Z]{3,}\b',
                             max_df=0.4, stop_words=stopwords.words())

In [12]:
df = pd.read_csv("mcdonalds-yelp-negative-reviews.csv", encoding="latin1")
corpus = df["review"].tolist()

X = vectorizer.fit_transform(corpus)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# it exists and fall back to the old name on older installations. `terms` is
# kept as a plain list either way.
if hasattr(vectorizer, "get_feature_names_out"):
    terms = list(vectorizer.get_feature_names_out())
else:
    terms = vectorizer.get_feature_names()

# Transposed so that each row is an n-gram and each column is one review
tf_idf = pd.DataFrame(X.toarray().transpose(), index=terms)

  'stop_words.' % sorted(inconsistent))


In [13]:
# Preview the start of the vocabulary instead of dumping every n-gram
terms[:20]

['aaaaaaaahhhhhhhhhhh still feel',
 'aaaaaaaahhhhhhhhhhh still feel situation',
 'abbreviated menu worthy',
 'abbreviated menu worthy mcdonald',
 'abc kitchen numerous',
 'abc kitchen numerous times',
 'ability answer questions',
 'ability answer questions menu',
 'ability innovate launching',
 'ability innovate launching products',
 'ability specific location',
 'ability specific location produce',
 'able access wifi',
 'able access wifi stopping',
 'able advance join',
 'able advance join main',
 'able buy meal',
 'able buy meal complain',
 'able catch time',
 'able catch time wait',
 'able collect thoughts',
 'able collect thoughts order',
 'able convince coupon',
 'able convince coupon clearly',
 'able convince twins',
 'able convince twins decided',
 'able escape raised',
 'able escape raised drive',
 'able exit onto',
 'able exit onto kostner',
 'able fulfill orders',
 'able fulfill orders cashier',
 'able get correct',
 'able get correct order',
 'able get dozen',
 'able get doz

In [14]:
# Rows are n-gram terms, columns are the positions of the reviews in `corpus`
tf_idf

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1515,1516,1517,1518,1519,1520,1521,1522,1523,1524
aaaaaaaahhhhhhhhhhh still feel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aaaaaaaahhhhhhhhhhh still feel situation,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
abbreviated menu worthy,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
abbreviated menu worthy mcdonald,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
abc kitchen numerous,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zombies bikes stopped stare,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zombies little less,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zombies little less predictable,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zoom line person,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# Total TF-IDF weight of each n-gram across all reviews, highest first.
# `tf_idf` itself is left untouched so this cell can be re-run safely
# (it was previously overwritten with the row sums).
term_totals = tf_idf.sum(axis=1)
score = term_totals.to_frame(name="score").sort_values(by="score", ascending=False)

In [16]:
# n-grams ranked by their score, highest first;
# the top-ranked phrases can be used to summarize the meaning of the documents
score

Unnamed: 0,score
worst mcdonald ever,2.602793
get order right,2.390093
worst mcdonalds ever,2.342385
went drive thru,1.754710
drive thru window,1.624240
...,...
need let peeps yelper,0.033990
part mcwrap could,0.033990
much stinky yes stinky,0.033990
much stinky yes,0.033990


In [None]:
# Write the ranked n-gram scores to scores.csv in the working directory
score.to_csv("scores.csv")

## Exercises

For the following exercises, use the definitions below:

**Term frequency**:
$$
tf = n(t,d)
$$
**Inverse document frequency**:
$$
idf = 1 + \frac{N}{df(t) + 1}
$$

In [None]:
# Toy corpus for the exercises: four short documents.
# NOTE: this reuses the name `documents` from the collocation section.
documents = [
    "He ate the food",
    "He liked the meal",
    "She likes the food from McDonalds, but she avoids the food from Burger King",
    "They like to eat 3 meals a day"
]

### Calculate the TF-IDF score for `like` in each of the documents

### Calculate the TF-IDF score for `the food` bigram in each of the documents