In [38]:
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt


# Read the dataset CSV (expected in the notebook's working directory) into a DataFrame.
csv_path = 'simpsons_dataset.csv'
df = pd.read_csv(csv_path)

In [39]:
# Peek at the first five rows to check the columns that were loaded.
df.head(n=5)

Unnamed: 0,raw_character_text,spoken_words
0,Miss Hoover,"No, actually, it was a little of both. Sometim..."
1,Lisa Simpson,Where's Mr. Bergstrom?
2,Miss Hoover,I don't know. Although I'd sure like to talk t...
3,Lisa Simpson,That life is worth living.
4,Edna Krabappel-Flanders,The polls will be open from now until the end ...


In [40]:
# Number of rows per character, most frequent first; show the top five speakers.
lines_per_character = df['raw_character_text'].value_counts()
lines_per_character.head()


Homer Simpson          29782
Marge Simpson          14141
Bart Simpson           13759
Lisa Simpson           11489
C. Montgomery Burns     3162
Name: raw_character_text, dtype: int64

In [50]:
# Restrict the data to the rows spoken by Lisa or Bart Simpson.
selected_characters = ['Lisa Simpson', 'Bart Simpson']
is_selected = df['raw_character_text'].isin(selected_characters)
df_new = df.loc[is_selected]
df_new.head()

Unnamed: 0,raw_character_text,spoken_words
1,Lisa Simpson,Where's Mr. Bergstrom?
3,Lisa Simpson,That life is worth living.
7,Bart Simpson,Victory party under the slide!
9,Lisa Simpson,Mr. Bergstrom! Mr. Bergstrom!
11,Lisa Simpson,Do you know where I could find him?


In [42]:
from sklearn.feature_extraction.text import CountVectorizer  # bag-of-words vectorizer

# Spoken lines of the selected characters as a Unicode array.
# Missing lines (NaN) are replaced by an empty string first: casting NaN straight
# to text would produce the literal "nan", which the vectorizer would then count
# as if it were a spoken word.
text = df_new['spoken_words'].fillna('').values.astype('U')

# Learn the vocabulary from the spoken lines, dropping the built-in English stop words.
vect = CountVectorizer(stop_words='english')
vect = vect.fit(text)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and only fall back on older releases.
# The result is kept as a plain list so slicing/printing looks the same either way.
if hasattr(vect, 'get_feature_names_out'):
    feature_names = list(vect.get_feature_names_out())
else:
    feature_names = vect.get_feature_names()
print(f"There are {len(feature_names)} words in the vocabulary. A selection: {feature_names[500:520]}")

There are 14258 words in the vocabulary. A selection: ['anguished', 'angus', 'anima', 'animal', 'animals', 'animated', 'animation', 'animators', 'anka', 'ankle', 'ann', 'annapolis', 'anne', 'annie', 'anniversary', 'annnnd', 'announce', 'announcement', 'announcements', 'announcer']


In [43]:
# Encode every spoken line as a row of word counts over the fitted vocabulary.
matrix = vect.transform(text)

# Preview the top-left corner: the first 1000 documents x the first 1000 vocabulary words.
# Printing a sparse matrix lists only its non-zero cells as "(row, column)  count".
corner = matrix[:1000, :1000]
print(corner)

  (24, 424)	1
  (29, 868)	1
  (30, 868)	4
  (31, 868)	1
  (38, 942)	1
  (39, 868)	1
  (40, 325)	1
  (45, 266)	1
  (45, 959)	1
  (58, 873)	1
  (62, 624)	1
  (63, 269)	1
  (63, 997)	1
  (74, 356)	1
  (80, 264)	1
  (82, 304)	1
  (89, 838)	1
  (98, 192)	1
  (100, 396)	1
  (103, 896)	1
  (107, 896)	1
  (146, 706)	1
  (146, 829)	1
  (151, 328)	1
  (154, 614)	1
  :	:
  (862, 570)	1
  (862, 774)	1
  (863, 644)	1
  (863, 869)	1
  (864, 869)	1
  (865, 644)	1
  (867, 644)	1
  (869, 644)	1
  (874, 997)	1
  (880, 787)	1
  (919, 838)	1
  (924, 848)	1
  (926, 997)	1
  (928, 997)	1
  (932, 997)	1
  (937, 997)	1
  (940, 920)	1
  (945, 896)	1
  (951, 559)	1
  (962, 304)	1
  (971, 929)	1
  (981, 656)	1
  (986, 80)	1
  (991, 838)	1
  (999, 319)	1


In [53]:
# Document-feature table: one row per spoken line, one column per vocabulary word.
# Rows are labelled with the speaking character's name, columns with the word itself.
# NOTE: toarray() materialises the full dense count matrix in memory.
docu_feat = pd.DataFrame(
    matrix.toarray(),
    index=df_new['raw_character_text'],
    columns=feature_names,
)

In [54]:
# Show a bounded preview instead of asking the notebook to render the whole
# document-feature table (one row per spoken line, one column per vocabulary word).
docu_feat.head(10)

Unnamed: 0_level_0,000,007,10,1000,10201,108,1094,11,12,120,...,zork,zorrinid,zuckerberg,zuh,zumanity,zur,zz,zzzapp,ãªtre,ãºna
raw_character_text,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Lisa Simpson,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Lisa Simpson,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Bart Simpson,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Lisa Simpson,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Lisa Simpson,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Lisa Simpson,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Lisa Simpson,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Bart Simpson,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Bart Simpson,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Bart Simpson,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
