# Biggie Smalls and 2Pac Rap dataset using the Shahkrokhian implementation 

In [1]:
import pandas as pd
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
import numpy as np

In [2]:
df3 = pd.read_csv("example_spam.csv",delimiter=",",names = ['label','message'])
# NOTE(review): df3 is not referenced by any other cell visible in this
# notebook -- confirm it is still needed; otherwise this read is a hidden
# dependency on example_spam.csv.


def load_lyrics(csv_path):
    """Load one artist's lyrics CSV into a single-column DataFrame.

    Only the second CSV column (index 1) is read and it is exposed as the
    'lyrics' column. Every character that is neither a word character nor
    whitespace is removed and the text is lower-cased.

    Parameters
    ----------
    csv_path : str
        Path to a header-less, latin-1 encoded CSV file.

    Returns
    -------
    pandas.DataFrame
        Frame with one column, 'lyrics', holding the cleaned text.
    """
    lyrics_df = pd.read_csv(csv_path, usecols=[1], encoding='latin-1', header=None)
    lyrics_df.columns = ['lyrics']
    # regex=True must be explicit: recent pandas versions treat the pattern as
    # a literal string by default, which would silently keep all punctuation.
    # The raw string avoids invalid-escape warnings for \w and \s.
    lyrics_df['lyrics'] = (
        lyrics_df['lyrics']
        .str.replace(r'[^\w\s]', '', regex=True)
        .str.lower()
    )
    return lyrics_df


# Loading the Biggie dataset
biggie_df = load_lyrics('./unigrams_NB_clasiffiers/biggie_lyrics.csv')

# Loading the 2Pac dataset
pac_df = load_lyrics('./unigrams_NB_clasiffiers/2pac_lyrics.csv')

In [3]:
# NumPy representations of both lyric frames. DataFrame.get_values() no longer
# exists in current pandas releases, so use the .values attribute instead.
valuesBiggie, values2pac = biggie_df.values, pac_df.values
print("Lyrics for biggie dataset")
print(biggie_df)
print("\nLyrics for 2pac dataset")
print(pac_df)

Lyrics for biggie dataset
                                               lyrics
0   fuck all you hoes\nget a grip motherfucker\nye...
1   as i grab the glock put it to your headpiece\n...
2   i dont wanna live no mo\nsometimes i hear deat...
3   to all the ladies in the place with style and ...
4   nineteenseventy somethin nigga i dont sweat th...
5   another day in the ghetto \none look outside i...
6   live from bedfordstuyverson the livest one\nre...
7   uh uh uh cmon\nhah sicker than your average\np...
8   uhh uhhh\nbig po ppa\nno info for the dea\nfed...
9   uhh its the ten crack commandments\nwhat uhh u...
10  the commission\nuncle paulie big ditti\ncaesar...
11  relax and take notes while i take tokes of the...
12  good evenin ladies and gentlemen\nhows everybo...
13  who shot ya\nseperate the weak from the obsole...
14  when i die fuck it i wanna go to hell\ncause i...
15  when the lala hits ya lyrics just splits ya\nh...

Lyrics for 2pac dataset
                               

#### Splitting songs into lines and labelling them with numbers (0 = Biggie, 1 = 2Pac)

In [4]:
def split_songs_into_lines(songs):
    """Flatten an iterable of whole-song strings into one list of lines.

    Each song is split on '\n'; the resulting lines are returned in their
    original order, song after song.
    """
    return [line for song in songs for line in song.split('\n')]


biggie_lyrics = split_songs_into_lines(biggie_df['lyrics'].values)
pac_lyrics = split_songs_into_lines(pac_df['lyrics'].values)

# (label, line) records: label 0 for Biggie lines, label 1 for 2Pac lines.
# Only lines with more than three whitespace-separated words are kept.
labelled_sources = [(0, biggie_lyrics), (1, pac_lyrics)]
records = [
    (label, line)
    for label, lines in labelled_sources
    for line in lines
    if len(line.split()) > 3
]

# dtype=object keeps the labels as integers; a plain np.array() of mixed
# int/str pairs would coerce every label to a string.
rap_lines = np.array(records, dtype=object)

# Getting the dataframe. It is built straight from the records so the label
# column never round-trips through strings and ends up with an integer dtype.
df = pd.DataFrame(records, columns=['label', 'line'])
df['label'] = df['label'].astype(int)

# DataFrame.get_values() no longer exists in current pandas; .values is equivalent here.
values = df.values
print("Values for the rap lines")
print(df.shape)
print(values)

Values for the rap lines
(1970, 2)
[[0 'fuck all you hoes']
 [0 'get a grip motherfucker']
 [0 'yeah this album is dedicated to all the teachers that told me']
 ...
 [1 'let me serenade the streets of la']
 [1 'from oakland to sactown the bay area and back down']
 [1 'cali is where they put their mack down']]


#### Converting all characters in each line to lower case

In [5]:
# Lower-case every rap line and store the result back in the 'line' column.
lowercased_lines = df['line'].str.lower()
df['line'] = lowercased_lines

print("Lines in lower-case for rap lines")
print(df['line'])

Lines in lower-case for rap lines
0                                       fuck all you hoes
1                                 get a grip motherfucker
2       yeah this album is dedicated to all the teache...
3       id never amount to nothin to all the people th...
4       buildings that i was hustlin in front of that ...
5       me when i was just tryin to make some money to...
6       and all the niggas in the struggle you know wh...
7                        uhha its all good baby baybee uh
8                                      it was all a dream
9                         i used to read word up magazine
10              saltnpepa and heavy d up in the limousine
11                             hangin pictures on my wall
12         every saturday rap attack mr magic marley marl
13                     i let my tape rock til my tape pop
14         smokin weed and bamboo sippin on private stock
15       way back when i had the red and black lumberjack
16                                  wi

#### Tokenizing the messages

First, we have to import NLTK and download its tokenizer models. Uncomment and run `nltk.download()` in the cell below: an installation window will appear. Go to the "Models" tab, select "punkt" in the "Identifier" column, and click "Download" to install the necessary files.

In [6]:
import nltk
#nltk.download()

Now we can apply the tokenization:

In [7]:
# Replace each line's text with the list of tokens produced by
# nltk.word_tokenize.
tokenized_lines = df['line'].apply(nltk.word_tokenize)
df['line'] = tokenized_lines

print("Lines after tokenization for rap lines")
print(df['line'])

Lines after tokenization for rap lines
0                                  [fuck, all, you, hoes]
1                            [get, a, grip, motherfucker]
2       [yeah, this, album, is, dedicated, to, all, th...
3       [id, never, amount, to, nothin, to, all, the, ...
4       [buildings, that, i, was, hustlin, in, front, ...
5       [me, when, i, was, just, tryin, to, make, some...
6       [and, all, the, niggas, in, the, struggle, you...
7                [uhha, its, all, good, baby, baybee, uh]
8                                [it, was, all, a, dream]
9                 [i, used, to, read, word, up, magazine]
10      [saltnpepa, and, heavy, d, up, in, the, limous...
11                       [hangin, pictures, on, my, wall]
12      [every, saturday, rap, attack, mr, magic, marl...
13           [i, let, my, tape, rock, til, my, tape, pop]
14      [smokin, weed, and, bamboo, sippin, on, privat...
15      [way, back, when, i, had, the, red, and, black...
16                            [wi

#### Stemming the messages using the Porter Stemmer algorithm

In [8]:
stemmer = PorterStemmer()


def stem_tokens(tokens):
    """Return the Porter stem of every token in ``tokens``, in the same order."""
    return [stemmer.stem(token) for token in tokens]


# Each row of 'line' holds a list of tokens; replace it with the stemmed list.
df['line'] = df['line'].apply(stem_tokens)

print("Lines after stemming for rap lines")
print(df['line'])

Lines after stemming for rap lines
0                                   [fuck, all, you, hoe]
1                              [get, a, grip, motherfuck]
2       [yeah, thi, album, is, dedic, to, all, the, te...
3       [id, never, amount, to, nothin, to, all, the, ...
4       [build, that, i, wa, hustlin, in, front, of, t...
5       [me, when, i, wa, just, tryin, to, make, some,...
6       [and, all, the, nigga, in, the, struggl, you, ...
7                  [uhha, it, all, good, babi, baybe, uh]
8                                 [it, wa, all, a, dream]
9                   [i, use, to, read, word, up, magazin]
10      [saltnpepa, and, heavi, d, up, in, the, limousin]
11                         [hangin, pictur, on, my, wall]
12      [everi, saturday, rap, attack, mr, magic, marl...
13           [i, let, my, tape, rock, til, my, tape, pop]
14      [smokin, weed, and, bamboo, sippin, on, privat...
15      [way, back, when, i, had, the, red, and, black...
16                            [with, 

#### Transforming data into occurrences

In [9]:
# Re-join each stemmed token list into one space-separated string, then let
# CountVectorizer learn the vocabulary and build the term-count matrix.
df['line'] = df['line'].apply(' '.join)

count_vect = CountVectorizer()
counts = count_vect.fit_transform(df['line'])

print("Counts and lines for rap lines")
print(df['line'])

Counts and lines for rap lines
0                                        fuck all you hoe
1                                   get a grip motherfuck
2       yeah thi album is dedic to all the teacher tha...
3       id never amount to nothin to all the peopl tha...
4       build that i wa hustlin in front of that call ...
5       me when i wa just tryin to make some money to ...
6       and all the nigga in the struggl you know what...
7                          uhha it all good babi baybe uh
8                                       it wa all a dream
9                           i use to read word up magazin
10               saltnpepa and heavi d up in the limousin
11                               hangin pictur on my wall
12         everi saturday rap attack mr magic marley marl
13                     i let my tape rock til my tape pop
14          smokin weed and bamboo sippin on privat stock
15       way back when i had the red and black lumberjack
16                                  with 

#### Using Term Frequency Inverse Document Frequency

In [15]:
# Fit the TF-IDF weighting on the raw term counts, then overwrite `counts`
# with the re-weighted matrix that the classifier cells consume.
transformer = TfidfTransformer()
transformer.fit(counts)
counts = transformer.transform(counts)

print("Counts after transforming for rap lines")
dense_counts = counts.toarray()
print(dense_counts)


Counts after transforming for rap lines
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):



# Initializing the Multinomial Naive Bayes Classifier

## Training the model

In [34]:
# Multinomial Naive Bayes classifier created with its default parameters;
# it is fitted later, in the evaluation cells.
model = MultinomialNB()

#### Splitting data into training and test sets 

In [33]:
# Fixed seed so that Restart Kernel -> Run All reproduces the same 90/10
# train/test split (and therefore the same accuracy) on every run.
SEED = 42
x_train, x_test, y_train, y_test = train_test_split(
    counts, df['label'], test_size=.1, random_state=SEED
)

## Evaluating the model

In [39]:
# Train on the training split, then predict labels for the held-out split.
model.fit(x_train, y_train)
predicted = model.predict(x_test)

# Fraction of held-out lines whose predicted label matches the true label.
accuracy = np.mean(predicted == y_test)

print("Results for rap lines")
print("The model got " + str(100*accuracy) + "% Accuracy")
print("Predicted class: " + str(predicted))

Results for rap lines
The model got 70.05076142131979% Accuracy
Predicted class: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 1 0 1 1 1 0 1 1 1 1 1 0 1 1 1 1
 0 0 1 1 0 1 0 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1
 0 0 1 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0
 0 0 1 1 1 1 1 0 0 0 0 0 1 1 0 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 0 1 1 0 1 1 1
 1 0 1 1 0 1 0 1 0 1 1 0 1 1 0 1 0 1 0 0 1 1 0 1 0 1 1 1 1 1 1 0 1 1 0 1 0
 0 1 1 1 1 0 1 1 0 0 1 0]


In [41]:
# Repeat the 90/10 split-train-evaluate cycle several times and average the
# accuracy. Each run gets its own fixed seed so the runs use different splits
# but the whole cell gives identical numbers on every re-execution.
N_RUNS = 10
BASE_SEED = 42

results = []
for run in range(N_RUNS):
    x_train, x_test, y_train, y_test = train_test_split(
        counts, df['label'], test_size=.1, random_state=BASE_SEED + run
    )
    model.fit(x_train, y_train)
    predicted = model.predict(x_test)
    res = np.mean(predicted == y_test)
    print("The model got " + str(100*res) + "% Accuracy")
    results.append(res)

# Note: the average is reported as a fraction (0-1), whereas the per-run
# lines above are percentages.
print("Average Accuracy: {:.2f}".format(np.mean(results)))

The model got 61.92893401015228% Accuracy
The model got 71.57360406091371% Accuracy
The model got 69.54314720812182% Accuracy
The model got 68.02030456852792% Accuracy
The model got 68.52791878172589% Accuracy
The model got 73.09644670050761% Accuracy
The model got 74.61928934010153% Accuracy
The model got 70.55837563451777% Accuracy
The model got 72.08121827411168% Accuracy
The model got 69.03553299492386% Accuracy
Average Accuracy: 0.70
