# Naive bayes Example: 

In [143]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [62]:
training_data = pd.read_csv('https://cdn.upgrad.com/UpGrad/temp/1c5e4a4a-1e9b-4bbb-a693-aa58f9aa4614/example_train.csv')
test_data = pd.read_csv('https://cdn.upgrad.com/UpGrad/temp/b5f44292-cd8f-4e69-a630-d73b807e8fca/example_test.csv')
training_data.head()

Unnamed: 0,Document,Class
0,Upgrad is a great educational institution.,education
1,Educational greatness depends on ethics,education
2,A story of great ethics and educational greatness,education
3,Sholey is a great cinema,cinema
4,good movie depends on good story,cinema


In [63]:
training_data.shape

(5, 2)

In [64]:
test_data.Class

0    education
Name: Class, dtype: object

In [65]:
from sklearn.feature_extraction.text import CountVectorizer

In [66]:
training_data['Class'] = training_data.Class.map({'education': 1, 'cinema': 0})
training_data.Class = training_data.Class.astype('int')

test_data['Class'] = test_data.Class.map({'education': 1, 'cinema': 0})
test_data.Class = test_data.Class.astype('int')

In [67]:
training_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 2 columns):
Document    5 non-null object
Class       5 non-null int64
dtypes: int64(1), object(1)
memory usage: 160.0+ bytes


In [68]:
x_train = training_data.values[:, 0]
y_train = training_data.values[:, 1]

x_test = test_data.values[:, 0]
y_test = test_data.values[:, 1]


In [91]:
print(y_train)

[1 1 1 0 0]


In [69]:
cv = CountVectorizer(stop_words='english')
cv_fit = cv.fit(text_data)

In [70]:
cv_fit.vocabulary_

{'upgrad': 11,
 'great': 5,
 'educational': 2,
 'institution': 7,
 'greatness': 6,
 'depends': 1,
 'ethics': 3,
 'story': 10,
 'sholey': 9,
 'cinema': 0,
 'good': 4,
 'movie': 8}

In [71]:
#====> Compress the train data sparse matrix to compressed sparse row format
x_transformed = cv.transform(x_train)

In [72]:
x_transformed.toarray()

array([[0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1],
       [0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0],
       [1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0],
       [0, 1, 0, 0, 2, 0, 0, 0, 1, 0, 1, 0]])

In [73]:
print(x_transformed)

  (0, 2)	1
  (0, 5)	1
  (0, 7)	1
  (0, 11)	1
  (1, 1)	1
  (1, 2)	1
  (1, 3)	1
  (1, 6)	1
  (2, 2)	1
  (2, 3)	1
  (2, 5)	1
  (2, 6)	1
  (2, 10)	1
  (3, 0)	1
  (3, 5)	1
  (3, 9)	1
  (4, 1)	1
  (4, 4)	2
  (4, 8)	1
  (4, 10)	1


In [74]:
cv.get_feature_names()

['cinema',
 'depends',
 'educational',
 'ethics',
 'good',
 'great',
 'greatness',
 'institution',
 'movie',
 'sholey',
 'story',
 'upgrad']

In [75]:
x_test_transform = cv.transform(x_test)

In [77]:
print(x_test_transform)

  (0, 2)	1
  (0, 4)	1
  (0, 7)	1


In [78]:
from sklearn.naive_bayes import MultinomialNB

In [88]:
y_train

array([1, 1, 1, 0, 0], dtype=object)

In [93]:
m_naive_bayes = MultinomialNB()

In [100]:
y_train = np.asarray(y_train, dtype="|S6")

In [104]:

m_naive_bayes.fit(x_transformed, y_train)
result = m_naive_bayes.predict_proba(x_test_transform)

In [106]:
#====> arry[0] ===> cinema calss
# ====> arry[1] ===> education
result

array([[0.32808399, 0.67191601]])

### 3. Building the Model: Bernoulli Naive Bayes

In [109]:
from sklearn.naive_bayes import BernoulliNB
bnb = BernoulliNB()
bnb.fit(x_transformed, y_train)
bnb.predict_proba(x_test_transform)

array([[0.2326374, 0.7673626]])

--------

# SMS Spam or Ham

In [111]:
sms_data = pd.read_table('https://cdn.upgrad.com/UpGrad/temp/e181771c-0af6-416e-9570-111f1f272661/SMSSpamCollection', header=None, names=['Class', 'sms'])
sms_data.head()

Unnamed: 0,Class,sms
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [113]:
len(sms_data)

5572

In [115]:
sms_data.Class.value_counts()

ham     4825
spam     747
Name: Class, dtype: int64

In [116]:
sms_data['Class'] = sms_data.Class.map({'ham': 0, 'spam': 1})
sms_data.head()

Unnamed: 0,Class,sms
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [119]:
sms_train, sms_test, sms_class_train, sms_class_test = train_test_split(sms_data.sms, sms_data.Class, random_state=100)

In [120]:
sms_train.shape

(4179,)

In [124]:
sms_train_transformed = cv.fit(sms_train)

In [126]:
sms_train_transformed.vocabulary_

{'ip': 3492,
 'address': 793,
 'test': 6351,
 'considering': 1868,
 'computer': 1835,
 'isn': 3511,
 'minecraft': 4220,
 'server': 5652,
 'ee': 2388,
 'msg': 4334,
 'na': 4393,
 'poortiyagi': 4966,
 'odalebeku': 4591,
 'hanumanji': 3119,
 'hanuman': 3118,
 'bajarangabali': 1140,
 'maruti': 4107,
 'pavanaputra': 4793,
 'sankatmochan': 5531,
 'ramaduth': 5204,
 'mahaveer': 4056,
 'lt': 3992,
 'gt': 3057,
 'janarige': 3547,
 'ivatte': 3526,
 'kalisidare': 3641,
 'saturday': 5552,
 'olage': 4617,
 'ondu': 4626,
 'good': 2987,
 'news': 4478,
 'keluviri': 3661,
 'maretare': 4093,
 'inde': 3419,
 'dodda': 2247,
 'problum': 5086,
 'nalli': 4401,
 'siguviri': 5769,
 'idu': 3364,
 'matra': 4126,
 'true': 6592,
 'don': 2269,
 'neglet': 4453,
 'anybody': 945,
 'number': 4556,
 'haven': 3147,
 'thought': 6422,
 'tactful': 6255,
 'way': 6939,
 'ask': 1032,
 'alex': 873,
 'wait': 6884,
 'til': 6452,
 'wednesday': 6962,
 'sms': 5867,
 'auction': 1072,
 'brand': 1399,
 'new': 4473,
 'nokia': 4511,
 '72

In [128]:
sms_train_transformed = cv.transform(sms_train)

In [133]:
print(sms_train_transformed)

  (0, 793)	1
  (0, 1835)	1
  (0, 1868)	1
  (0, 3492)	1
  (0, 3511)	1
  (0, 4220)	1
  (0, 5652)	1
  (0, 6351)	1
  (1, 1140)	1
  (1, 2247)	1
  (1, 2269)	1
  (1, 2388)	2
  (1, 2987)	1
  (1, 3057)	2
  (1, 3118)	1
  (1, 3119)	1
  (1, 3364)	1
  (1, 3419)	1
  (1, 3526)	1
  (1, 3547)	1
  (1, 3641)	1
  (1, 3661)	1
  (1, 3992)	2
  (1, 4056)	1
  (1, 4093)	1
  :	:
  (4174, 5338)	1
  (4174, 6137)	1
  (4174, 6320)	1
  (4174, 6491)	1
  (4174, 6730)	1
  (4175, 1552)	1
  (4175, 1789)	1
  (4175, 2027)	1
  (4175, 3008)	1
  (4175, 3820)	1
  (4176, 1661)	1
  (4176, 3032)	1
  (4176, 3567)	1
  (4176, 3625)	1
  (4176, 4238)	1
  (4176, 4303)	1
  (4176, 5246)	1
  (4176, 5614)	1
  (4177, 2360)	1
  (4177, 3581)	1
  (4177, 3940)	1
  (4177, 6249)	1
  (4178, 789)	1
  (4178, 3148)	1
  (4178, 5415)	1


In [136]:
y_train

array([b'1', b'1', b'1', b'0', b'0'], dtype='|S6')

In [153]:
mnb = MultinomialNB()
mnb.fit(sms_train_transformed, sms_class_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [154]:
sms_test_transform = cv.transform(sms_test)
y_predicted_class = mnb.predict(sms_test_transform)
y_predicted_prob = mnb.predict_proba(sms_test_transform)
y_predicted_class

array([0, 0, 1, ..., 0, 1, 0])

In [156]:
metrics.accuracy_score(sms_class_test, y_predicted_class)

0.9849246231155779

In [159]:
cm = metrics.confusion_matrix(y_predicted_class, sms_class_test)

In [164]:
cm

array([[1197,   12],
       [   9,  175]])

In [162]:
TP = cm[1, 1]
FP = cm[0, 1]
TN = cm[0, 0]
FN = cm[1, 0]

In [163]:
#====> sensitivity
sensitivity = TP / (FN + TP)
sensitivity

0.9510869565217391

In [165]:
# =====> Specificity
specificity = TN / (TN + FP)
specificity

0.9900744416873449

In [169]:
# ====> precision
precision = TP / (TP + FP)
print(precision)
print(metrics.precision_score(sms_class_test, y_predicted_class))

0.9358288770053476
0.9510869565217391


In [170]:
metrics.recall_score(sms_class_test, y_predicted_class)

0.9358288770053476

In [171]:
metrics.f1_score(sms_class_test, y_predicted_class)

0.9433962264150944

In [179]:
fpr, tpr, threshould = metrics.roc_curve(sms_class_test, y_predicted_prob[:, 1])

In [180]:
pd.DataFrame({'fpr': fpr, 'threshould': threshould, 'tpr': tpr})

Unnamed: 0,fpr,threshould,tpr
0,0.000000,2.000000e+00,0.000000
1,0.000000,1.000000e+00,0.310160
2,0.000000,1.000000e+00,0.320856
3,0.000000,1.000000e+00,0.342246
4,0.000000,1.000000e+00,0.347594
5,0.000000,1.000000e+00,0.358289
6,0.000000,1.000000e+00,0.427807
7,0.000000,1.000000e+00,0.438503
8,0.000000,1.000000e+00,0.508021
9,0.000000,1.000000e+00,0.518717


In [181]:
from sklearn.naive_bayes import BernoulliNB

In [182]:
bnb = BernoulliNB()

In [183]:
bnb.fit(sms_train_transformed, sms_class_train)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [185]:
bnb_y_predicted = bnb.predict(sms_test_transform)
bnb_y_predicted

array([0, 0, 1, ..., 0, 1, 0])

In [186]:
bnb_y_pred_prob = bnb.predict_proba(sms_test_transform)

In [187]:
metrics.accuracy_score(sms_class_test, bnb_y_predicted)

0.9741564967695621

In [188]:
bcm = metrics.confusion_matrix(sms_class_test, bnb_y_predicted)
bcm

array([[1205,    1],
       [  35,  152]])

In [190]:
TP = bcm[1, 1]
FP = bcm[0, 1]
TN = bcm[0, 0]
FN = bcm[1, 0]

In [192]:
#====> sensitivity
sensitivity = TP / (FN + TP)
sensitivity

0.8128342245989305

In [191]:
# =====> Specificity
specificity = TN / (TN + FP)
specificity

0.9991708126036484

----------

# IMDB Ratings sentiment analysis:

-------

In [218]:
imdb_train_data = pd.read_csv('https://cdn.upgrad.com/UpGrad/temp/049d2f51-7903-4fc3-a7b4-5ae2f1eb6968/movie_review_train.csv')
imdb_test_data = pd.read_csv('https://cdn.upgrad.com/UpGrad/temp/93a8a3f6-b8c7-4e1a-8ce1-5734df27e875/movie_review_test.csv')
imdb_train_data.head()

Unnamed: 0,class,text
0,Pos,a common complaint amongst film critics is ...
1,Pos,whew this film oozes energy the kind of b...
2,Pos,steven spielberg s amistad which is bas...
3,Pos,he has spent his entire life in an awful litt...
4,Pos,being that it is a foreign language film with...


In [219]:
imdb_train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600 entries, 0 to 1599
Data columns (total 2 columns):
class    1600 non-null object
text     1600 non-null object
dtypes: object(2)
memory usage: 25.1+ KB


In [220]:
imdb_train_data['class'].unique()

array(['Pos', 'Neg'], dtype=object)

In [221]:
imdb_train_data['class'].value_counts()

Pos    800
Neg    800
Name: class, dtype: int64

In [222]:
imdb_test_data.head()

Unnamed: 0,class,text
0,Pos,films adapted from comic books have had plent...
1,Pos,every now and then a movie comes along from a...
2,Pos,you ve got mail works alot better than it des...
3,Pos,jaws is a rare film that grabs your atte...
4,Pos,moviemaking is a lot like being the general m...


In [223]:
imdb_test_data['class'].value_counts()

Pos    200
Neg    200
Name: class, dtype: int64

In [225]:
def map_class(x):
    return x.map({'Pos': 1, 'Neg': 0})

imdb_train_data['class'] = imdb_train_data['class'].map({'Pos': 1, 'Neg': 0})
imdb_test_data['class'] = imdb_test_data['class'].map({'Pos': 1, 'Neg': 0})

In [226]:
imdb_train_data.head()

Unnamed: 0,class,text
0,1,a common complaint amongst film critics is ...
1,1,whew this film oozes energy the kind of b...
2,1,steven spielberg s amistad which is bas...
3,1,he has spent his entire life in an awful litt...
4,1,being that it is a foreign language film with...


In [228]:
imdb_test_data['class'].unique()

array([1, 0])

In [232]:
imdb_cv = cv.fit(imdb_train_data.text)

In [233]:
imdb_cv.vocabulary_

{'common': 6284,
 'complaint': 6360,
 'film': 11832,
 'critics': 7378,
 'aren': 1810,
 'literate': 18603,
 'scripts': 27918,
 'available': 2319,
 'quiz': 25250,
 'gives': 13262,
 'signs': 28809,
 'hope': 15074,
 'art': 1928,
 'writing': 35521,
 'isn': 16771,
 'dead': 7904,
 'hollywood': 14963,
 'need': 21281,
 'look': 18770,
 'independent': 15988,
 'films': 11851,
 'thoughtful': 32060,
 'content': 6771,
 'paul': 23045,
 'attanasio': 2173,
 'script': 27912,
 'takes': 31486,
 'tepid': 31839,
 'thriller': 32090,
 'scandals': 27629,
 'late': 18017,
 '50s': 313,
 'delivers': 8215,
 'telling': 31765,
 'parable': 22815,
 'emptiness': 10340,
 'post': 24178,
 'war': 34707,
 'american': 1334,
 'dream': 9554,
 'golden': 13449,
 'bubble': 4285,
 'surrounds': 31156,
 'protects': 24849,
 'tv': 33056,
 'networks': 21378,
 'sponsors': 29948,
 'riddled': 26743,
 'symbols': 31370,
 '58': 328,
 'chrysler': 5614,
 'radio': 25314,
 'announcement': 1534,
 'sputnik': 30053,
 'heavy': 14532,
 'handed': 14185,

In [238]:
len(imdb_cv.get_feature_names())

35858

In [239]:
# ====> count vecotorizer ignore the words which appeared only 3% 
# =====> count vercorizer ignore the words which appeared frequently 80% of the documents
cv_with_cond = CountVectorizer(stop_words='english', min_df=.03, max_df=.8)

In [242]:
cv_with_cond.fit(imdb_train_data.text)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.8, max_features=None, min_df=0.03,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [243]:
cv_with_cond.vocabulary_

{'common': 264,
 'critics': 323,
 'aren': 78,
 'available': 101,
 'gives': 618,
 'hope': 693,
 'art': 81,
 'writing': 1632,
 'isn': 753,
 'dead': 342,
 'hollywood': 690,
 'need': 970,
 'look': 853,
 'films': 549,
 'content': 287,
 'paul': 1037,
 'script': 1248,
 'takes': 1429,
 'thriller': 1471,
 'late': 805,
 'delivers': 357,
 'telling': 1449,
 'post': 1089,
 'war': 1571,
 'american': 59,
 'dream': 413,
 'tv': 1512,
 'radio': 1145,
 'heavy': 673,
 'direction': 386,
 'robert': 1205,
 'performances': 1043,
 'john': 766,
 'rob': 1204,
 'perfectly': 1041,
 'usually': 1537,
 'quality': 1136,
 'sets': 1272,
 'camera': 192,
 'work': 1618,
 'recent': 1165,
 'century': 213,
 'period': 1044,
 'pieces': 1056,
 'years': 1638,
 'old': 1001,
 'images': 716,
 'true': 1502,
 'era': 464,
 'generation': 605,
 'gone': 625,
 '15': 4,
 'world': 1623,
 'themes': 1461,
 'good': 626,
 'life': 831,
 'family': 508,
 'match': 894,
 'father': 521,
 'fame': 506,
 'audience': 99,
 'appear': 72,
 'familiar': 507,
 

In [244]:
len(cv_with_cond.get_feature_names())

1643

In [245]:
imdb_x_train_transform = cv_with_cond.transform(imdb_train_data.text)
imdb_x_test_transform = cv_with_cond.transform(imdb_test_data.text)

In [273]:
imdb_x_test_transform.count_nonzero()

51663

In [274]:
imdb_bnb = BernoulliNB()

In [276]:
imdb_bnb.fit(imdb_x_train_transform, imdb_train_data['class'])

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [278]:
imdb_y_predicted = imdb_bnb.predict(imdb_x_test_transform)

In [279]:
metrics.accuracy_score(imdb_test_data['class'], imdb_y_predicted)

0.79

In [284]:
imdb_cm = metrics.confusion_matrix(imdb_test_data['class'], imdb_y_predicted)
imdb_cm

array([[177,  23],
       [ 61, 139]])

In [285]:
FP = imdb_cm[0, 1]
TN = imdb_cm[0, 0]
TP = imdb_cm[1, 1]
FN = imdb_cm[1, 0]

In [286]:
#====> sensitivity
sensitivity = TP / (FN + TP)
sensitivity

0.695

In [287]:
# =====> Specificity
specificity = TN / (TN + FP)
specificity

0.885

In [288]:
metrics.recall_score(imdb_test_data['class'], imdb_y_predicted)

0.695

In [289]:
metrics.precision_score(imdb_test_data['class'], imdb_y_predicted)

0.8580246913580247