In [2]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer #The CountVectorizer object

In [3]:
# Read the Simpsons script lines from a CSV (path is relative to the working directory)
# and preview the first five rows.
simpsons = pd.read_csv(filepath_or_buffer="simpsons_dataset.csv")
simpsons.head()

Unnamed: 0,raw_character_text,spoken_words
0,Miss Hoover,"No, actually, it was a little of both. Sometim..."
1,Lisa Simpson,Where's Mr. Bergstrom?
2,Miss Hoover,I don't know. Although I'd sure like to talk t...
3,Lisa Simpson,That life is worth living.
4,Edna Krabappel-Flanders,The polls will be open from now until the end ...


In [4]:
# Build one frame per character by matching on the speaker column.
speaker = simpsons['raw_character_text']
bart = simpsons[speaker == 'Bart Simpson']
lisa = simpsons[speaker == 'Lisa Simpson']

# Alternative: select both characters with one mask (rows stay in script order):
# simpsons[simpsons['raw_character_text'].isin(['Bart Simpson', 'Lisa Simpson'])]

In [5]:
# Stack Bart's rows on top of Lisa's, keeping each row's original index label.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4
# and removed in pandas 2.0.
bart_lisa = pd.concat([bart, lisa])

In [6]:
# Sanity check: how many lines of dialogue each character contributes.
bart_lisa.raw_character_text.value_counts()

Bart Simpson    13759
Lisa Simpson    11489
Name: raw_character_text, dtype: int64

In [7]:
# Convert the dialogue column to a NumPy array of unicode strings for the vectorizer.
# Missing dialogue (NaN) would otherwise be cast to the literal string 'nan' by
# astype('U') and be counted as a real word, so replace it with an empty string first.
text = bart_lisa['spoken_words'].fillna('').values.astype('U')

In [8]:
# Fit a bag-of-words vectorizer on the dialogue, dropping English stop words.
# The vocabulary only exists after fitting, so fit before asking for feature names.
vect = CountVectorizer(stop_words='english').fit(text)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older installs.
# .tolist() keeps feature_names a plain list of str, as the old method returned.
if hasattr(vect, 'get_feature_names_out'):
    feature_names = vect.get_feature_names_out().tolist()
else:
    feature_names = vect.get_feature_names()

print(f"There are a lot ({len(feature_names)}) unique words in the vocabulary. some of them:\n\n{feature_names[700:750]}")


There are a lot (14258) unique words in the vocabulary. some of them:

['ashamed', 'ashes', 'ashley', 'ashman', 'aside', 'asimov', 'ask', 'asked', 'asking', 'asks', 'asleep', 'aspen', 'aspiration', 'aspiring', 'ass', 'assange', 'assassination', 'assassins', 'assed', 'assel', 'assemble', 'assembled', 'assembly', 'assemblywoman', 'asses', 'asset', 'assignment', 'assisi', 'assistance', 'assistant', 'assisting', 'associate', 'associates', 'association', 'assuage', 'assume', 'assumed', 'assure', 'assured', 'asterisk', 'asterix', 'asteroid', 'asthma', 'astro', 'astronaut', 'astronomer', 'astronomy', 'asylum', 'ate', 'atheism']


In [11]:
# Encode every line of dialogue as a row of word counts over the fitted vocabulary.
matrix = vect.transform(text)
# Peek at the top-left 500 x 500 corner; only the non-zero cells get printed.
print(matrix[:500, :500])

  (11, 325)	1
  (36, 356)	1
  (40, 264)	1
  (42, 304)	1
  (97, 328)	1
  (102, 325)	1
  (103, 451)	1
  (107, 325)	1
  (129, 461)	1
  (150, 325)	1
  (153, 397)	1
  (172, 404)	1
  (209, 325)	1
  (214, 493)	1
  (215, 163)	1
  (276, 70)	1
  (293, 360)	1
  (314, 304)	1
  (361, 342)	1
  (398, 325)	1
  (413, 19)	1
  (422, 427)	1
  (435, 236)	1
  (449, 329)	1
  (459, 300)	1
  (473, 133)	1
  (490, 343)	1


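The output lists only the non-zero values, because `matrix` is a sparse matrix; that is why most of the cells in the slice are not shown. A large slice is needed to see any entries at all, since the vast majority of cells are zero. Each line reads `(row, column)  count`: a 1 means that the word in that column appears ONCE in that ONE sentence (row).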

In [48]:
# Dense document-term table: one row per line of dialogue, one column per vocabulary word.
# NOTE: toarray() materialises every zero cell, so memory grows with rows x vocabulary size.
docu_feat = pd.DataFrame(
    matrix.toarray(),
    index=bart_lisa['raw_character_text'],  # row labels: the character who speaks the line
    columns=feature_names,                  # column labels: the words from the vocabulary
)

docu_feat

Unnamed: 0_level_0,000,007,10,1000,10201,108,1094,11,12,120,...,zork,zorrinid,zuckerberg,zuh,zumanity,zur,zz,zzzapp,ãªtre,ãºna
raw_character_text,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Bart Simpson,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Bart Simpson,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Bart Simpson,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Bart Simpson,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Bart Simpson,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Bart Simpson,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Bart Simpson,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Bart Simpson,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Bart Simpson,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Bart Simpson,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
