In [2]:
import numpy as np
import pandas as pd

In [3]:
# Load the SMS spam collection; the file is tab-separated, hence sep='\t'.
df = pd.read_csv('./TextFiles/smsspamcollection.tsv',sep='\t')

In [4]:
# Preview the first rows: label, raw message text, and the length / punct counts.
df.head()

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2


In [5]:
# Count missing values per column before modelling.
df.isnull().sum()

label      0
message    0
length     0
punct      0
dtype: int64

In [6]:
# Total number of messages in the dataset.
len(df)

5572

In [7]:
# Distinct target classes present in the label column.
df['label'].unique()

array(['ham', 'spam'], dtype=object)

In [8]:
# Class balance check: how many messages carry each label.
df['label'].value_counts()

label
ham     4825
spam     747
Name: count, dtype: int64

In [9]:
from sklearn.model_selection import train_test_split

# Baseline experiment: predict the label from the two numeric columns only
# (message length and punctuation count), ignoring the message text itself.
y = df['label']
X = df.loc[:, ['length', 'punct']]

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.3,
)

In [12]:
# Sanity-check the split sizes: train/test features, then train/test labels.
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((3900, 2), (1672, 2), (3900,), (1672,))

In [13]:
from sklearn.linear_model import LogisticRegression

# Logistic regression baseline on the length / punct features.
lr_model = LogisticRegression(solver='lbfgs')

# Fit on the training split only; the test split stays unseen until evaluation.
lr_model.fit(X_train,y_train)

In [17]:
from sklearn import metrics

# Predicted labels for the held-out messages (displayed as the cell output).
predn = lr_model.predict(X_test)
predn

array(['ham', 'ham', 'ham', ..., 'ham', 'ham', 'ham'], dtype=object)

In [18]:
# Confusion matrix: rows are true labels, columns are predicted labels
# (scikit-learn convention), with classes in sorted order: ham, spam.
print(metrics.confusion_matrix(y_test,predn))

[[1404   44]
 [ 219    5]]


In [19]:
# Per-class precision / recall / F1; the spam row is the one to watch for this baseline.
print(metrics.classification_report(y_test,predn))

              precision    recall  f1-score   support

         ham       0.87      0.97      0.91      1448
        spam       0.10      0.02      0.04       224

    accuracy                           0.84      1672
   macro avg       0.48      0.50      0.48      1672
weighted avg       0.76      0.84      0.80      1672



In [20]:
# Overall accuracy. NOTE: the test split is mostly ham (1448 of 1672 in the
# recorded run, about 0.866), so accuracy alone overstates this model; always
# predicting 'ham' would score higher than the value recorded for this cell.
print(metrics.accuracy_score(y_test,predn))

0.8427033492822966


In [21]:
import numpy as np
import pandas as pd

# Second experiment: reload the same SMS dataset from scratch so the text-based
# model starts from an unmodified frame.
df = pd.read_csv('./TextFiles/smsspamcollection.tsv',sep='\t')

# Preview the first rows.
df.head()

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2


In [22]:
# Re-check for missing values after the reload.
df.isnull().sum()

label      0
message    0
length     0
punct      0
dtype: int64

In [24]:
# Class balance of the reloaded data.
df['label'].value_counts()

label
ham     4825
spam     747
Name: count, dtype: int64

In [38]:
from sklearn.model_selection import train_test_split

# This time the raw message text is the only feature; the label is unchanged.
X, y = df['message'], df['label']

# Same 70/30 split and seed as the numeric baseline.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.3,
)

In [26]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectoriser with default settings.
count_vect = CountVectorizer()

In [27]:
# Learn the vocabulary from the training messages and convert them to a
# document-term count matrix in one step (fit on training data only).
X_train_counts =  count_vect.fit_transform(X_train)

In [29]:
# (number of training messages, vocabulary size)
X_train_counts.shape

(3900, 7263)

In [30]:
from sklearn.feature_extraction.text import TfidfTransformer

# Re-weight the raw counts with TF-IDF.
tfidf_transformer = TfidfTransformer()

# Fit the IDF weights on the training counts and transform them in one call.
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [31]:
# Shape check after the TF-IDF step.
X_train_tfidf.shape

(3900, 7263)

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TfidfVectorizer performs counting and TF-IDF weighting in a single estimator
# (scikit-learn documents it as CountVectorizer followed by TfidfTransformer).
vectorizer = TfidfVectorizer()

# NOTE: this overwrites the X_train_tfidf computed by the two-step approach.
X_train_tfidf = vectorizer.fit_transform(X_train)

In [33]:
from sklearn.svm import LinearSVC

# Linear support-vector classifier with default settings.
clf = LinearSVC()

# Train on the TF-IDF matrix of the training messages.
clf.fit(X_train_tfidf,y_train)



In [41]:
from sklearn.pipeline import Pipeline

# Bundle vectorisation and classification so raw strings can be passed straight
# to fit / predict and the same fitted vocabulary is reused at prediction time.
text_clf = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer()),
        ('clf', LinearSVC()),
    ]
)

In [42]:
# Fit the whole pipeline on the raw training messages.
text_clf.fit(X_train, y_train)



In [43]:
# Predict labels for the held-out raw messages; the pipeline vectorises them internally.
predn = text_clf.predict(X_test)

In [45]:
# Per-class precision / recall / F1 for the text pipeline.
print(metrics.classification_report(y_test,predn))

              precision    recall  f1-score   support

         ham       0.99      1.00      1.00      1448
        spam       0.99      0.96      0.97       224

    accuracy                           0.99      1672
   macro avg       0.99      0.98      0.98      1672
weighted avg       0.99      0.99      0.99      1672



In [46]:
# Confusion matrix (rows = true label, columns = predicted; order: ham, spam).
print(metrics.confusion_matrix(y_test,predn))

[[1445    3]
 [  10  214]]


In [47]:
# Spot check on a new message; predict expects an iterable of strings, hence the list.
text_clf.predict(['Hi, how are you'])

array(['ham'], dtype=object)

In [48]:
import numpy as np
import pandas as pd

In [49]:
# Third experiment: movie reviews labelled pos / neg (tab-separated file).
# NOTE: this rebinds `df`, replacing the SMS frame used above.
df = pd.read_csv('./TextFiles/moviereviews.tsv',sep='\t')
df

Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...
...,...,...
1995,pos,"i like movies with albert brooks , and i reall..."
1996,pos,it might surprise some to know that joel and e...
1997,pos,the verdict : spine-chilling drama from horror...
1998,pos,i want to correct what i wrote in a former ret...


In [52]:
# Look at the full text of the review at index label 0.
df['review'][0]

'how do films like mouse hunt get into theatres ? \r\nisn\'t there a law or something ? \r\nthis diabolical load of claptrap from steven speilberg\'s dreamworks studio is hollywood family fare at its deadly worst . \r\nmouse hunt takes the bare threads of a plot and tries to prop it up with overacting and flat-out stupid slapstick that makes comedies like jingle all the way look decent by comparison . \r\nwriter adam rifkin and director gore verbinski are the names chiefly responsible for this swill . \r\nthe plot , for what its worth , concerns two brothers ( nathan lane and an appalling lee evens ) who inherit a poorly run string factory and a seemingly worthless house from their eccentric father . \r\ndeciding to check out the long-abandoned house , they soon learn that it\'s worth a fortune and set about selling it in auction to the highest bidder . \r\nbut battling them at every turn is a very smart mouse , happy with his run-down little abode and wanting it to stay that way . \r\

In [53]:
# Count missing values per column; reviews with no text show up here as NaN.
df.isnull().sum()

label      0
review    35
dtype: int64

In [54]:
# Discard rows with a missing label or review. Rebinding (rather than
# inplace=True) yields the same frame without mutating an object in place.
df = df.dropna()

In [55]:
# Confirm no missing values remain.
df.isnull().sum()

label     0
review    0
dtype: int64

In [57]:
# Collect the index labels of reviews that contain no real text, i.e. that are
# empty or whitespace-only. These are not NaN, so the null check does not see them.
#
# Done with a vectorised mask on the 'review' column instead of unpacking
# df.itertuples(): the mask does not depend on the frame having exactly two
# columns, and stripping then comparing with '' also catches the empty string,
# which str.isspace() reports as False.
blank_mask = df['review'].str.strip().eq('')
blanks = df.index[blank_mask.to_numpy()].tolist()

In [58]:
# Index labels of the blank reviews found above.
blanks

[57,
 71,
 147,
 151,
 283,
 307,
 313,
 323,
 343,
 351,
 427,
 501,
 633,
 675,
 815,
 851,
 977,
 1079,
 1299,
 1455,
 1493,
 1525,
 1531,
 1763,
 1851,
 1905,
 1993]

In [59]:
# Remove the blank reviews listed in `blanks`.
# Rebinding with errors='ignore' keeps this cell idempotent: re-running it once
# the rows are already gone is a no-op instead of raising KeyError, which the
# in-place drop did on a second execution.
df = df.drop(index=blanks, errors='ignore')

# Remaining (rows, columns) after cleaning.
df.shape

(1938, 2)

In [60]:
from sklearn.model_selection import train_test_split

# Review text is the feature, the pos / neg label is the target.
X, y = df['review'], df['label']

# 70/30 split with a fixed seed so the evaluation is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.3,
)

In [61]:
from sklearn.pipeline import Pipeline

# TfidfVectorizer is the class the pipeline cell below instantiates. Importing
# it here means this section no longer relies on an import made much earlier in
# the notebook. TfidfTransformer is kept so existing references still resolve.
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer

from sklearn.svm import LinearSVC

In [62]:
# TF-IDF + linear SVM pipeline for the movie reviews.
# NOTE: this rebinds `text_clf`, replacing the SMS spam pipeline.
text_clf = Pipeline([('tfidf',TfidfVectorizer()),('clf',LinearSVC())])

# Fit on the raw training reviews.
text_clf.fit(X_train,y_train)



In [63]:
# Predicted sentiment for the held-out reviews.
predn = text_clf.predict(X_test)

In [66]:
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

# Per-class precision / recall / F1, followed by the confusion matrix
# (rows = true label, columns = predicted; order: neg, pos).
print(classification_report(y_test,predn))
print(confusion_matrix(y_test,predn))

              precision    recall  f1-score   support

         neg       0.85      0.83      0.84       282
         pos       0.85      0.86      0.85       300

    accuracy                           0.85       582
   macro avg       0.85      0.85      0.85       582
weighted avg       0.85      0.85      0.85       582

[[235  47]
 [ 41 259]]


In [67]:
# Overall accuracy on the held-out reviews; the two classes are roughly
# balanced in the recorded test split (282 neg / 300 pos).
print(accuracy_score(y_test,predn))

0.8487972508591065
