In [426]:
# Importing necessary Libraries
import pandas as pd 
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn import model_selection


In [427]:
# Reading the data 
dftest = pd.read_csv("TestDataset.csv")
dftrain = pd.read_csv("TrainDataset.csv")

In [428]:
# It is safe to copy the data in a dataframe
train_data = dftrain
test_data = dftest

In [429]:
dftrain.head()

Unnamed: 0,v1,v2
0,spam,U were outbid by simonwatson5120 on the Shinco...
1,ham,Do you still have the grinder?
2,ham,No. Yes please. Been swimming?
3,ham,No de.am seeing in online shop so that i asked.
4,ham,"Faith makes things possible,Hope makes things ..."


In [430]:
dftest.head()

Unnamed: 0,v2
0,Prabha..i'm soryda..realy..frm heart i'm sory
1,"Jus chillaxin, what up"
2,Ok no prob. Take ur time.
3,Congrats! 2 mobile 3G Videophones R yours. cal...
4,Thankyou so much for the call. I appreciate yo...


In [431]:
# Renaming the columns v1 and v2
dftrain = dftrain.rename(columns={"v1":"labels","v2":"message"})
dftest = dftest.rename(columns={"v2":"message"})

In [432]:
labels = dftrain['labels']

In [433]:
dftrain.shape

(4457, 2)

In [434]:
dftest.shape

(1115, 1)

In [435]:
dftrain.labels.value_counts()

ham     3868
spam     589
Name: labels, dtype: int64

In [436]:
dftrain.drop("labels",axis=1,inplace=True)

In [437]:
dftrain = dftrain.append(dftest)

In [439]:
sms = dftrain.message

In [440]:
sms

0       U were outbid by simonwatson5120 on the Shinco...
1                          Do you still have the grinder?
2                          No. Yes please. Been swimming?
3         No de.am seeing in online shop so that i asked.
4       Faith makes things possible,Hope makes things ...
5                                 Hey u still at the gym?
6                       Where is that one day training:-)
7       Did I forget to tell you ? I want you , I need...
8              I dont thnk its a wrong calling between us
9       December only! Had your mobile 11mths+? You ar...
10      That way transport is less problematic than on...
11      Storming msg: Wen u lift d phne, u say \HELLO\...
12          Awesome, that gonna be soon or later tonight?
13                                         What's ur pin?
14                 , how's things? Just a quick question.
15      UpgrdCentre Orange customer, you may now claim...
16      \Hello-/@drivby-:0quit edrunk sorry iff pthis ...
17            

In [441]:
final_sms = sms.str.replace(r'^.+@[^\.].*\.[a-z]{2,}$', 'emailaddress')
final_sms = final_sms.str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$', 'webaddress')
final_sms = final_sms.str.replace(r'£|\$', 'money-symbol')
final_sms = final_sms.str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$', 'phone-number')
final_sms = final_sms.str.replace(r'\d+(\.\d+)?', 'number')
final_sms = final_sms.str.replace(r'[^\w\d\s]', ' ')
final_sms = final_sms.str.replace(r'\s+', ' ')
final_sms = final_sms.str.replace(r'^\s+|\s*?$', ' ')
final_sms = final_sms.str.lower()


In [442]:
final_sms

0       u were outbid by simonwatsonnumber on the shin...
1                         do you still have the grinder  
2                           no yes please been swimming  
3        no de am seeing in online shop so that i asked  
4       faith makes things possible hope makes things ...
5                                hey u still at the gym  
6                        where is that one day training  
7       did i forget to tell you i want you i need you...
8             i dont thnk its a wrong calling between us 
9       december only had your mobile numbermths you a...
10      that way transport is less problematic than on...
11      storming msg wen u lift d phne u say hello do ...
12          awesome that gonna be soon or later tonight  
13                                        what s ur pin  
14                   how s things just a quick question  
15      upgrdcentre orange customer you may now claim ...
16       hello drivby numberquit edrunk sorry iff pthi...
17            

In [443]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
final_sms = final_sms.apply(lambda x: ' '.join(term for term in x.split() if term not in stop_words))

In [444]:
ps = nltk.PorterStemmer()
final_sms = final_sms.apply(lambda x: ' '.join(ps.stem(term) for term in x.split()))

In [445]:
final_sms

0       u outbid simonwatsonnumb shinco dvd plyr numbe...
1                                           still grinder
2                                           ye pleas swim
3                                   de see onlin shop ask
4       faith make thing possibl hope make thing work ...
5                                         hey u still gym
6                                           one day train
7       forget tell want need crave love sweet arabian...
8                                 dont thnk wrong call us
9       decemb mobil numbermth entitl updat latest col...
10      way transport less problemat sat night way u w...
11      storm msg wen u lift phne u say hello u knw wt...
12                        awesom gonna soon later tonight
13                                                 ur pin
14                                   thing quick question
15      upgrdcentr orang custom may claim free camera ...
16      hello drivbi numberquit edrunk sorri iff pthi ...
17            

In [446]:
# creating a bag-of-words
all_words = []
for sms in final_sms:
    words = word_tokenize(sms)
    for w in words:
        all_words.append(w)
        
all_words = nltk.FreqDist(all_words)  

In [447]:
# print total number of words
print('Number of words: {}'.format(len(all_words)))

Number of words: 6534


In [448]:
# print 10 most common words
print('10 most common words: {}'.format(all_words.most_common(10)))

10 most common words: [('number', 2739), ('u', 1192), ('call', 672), ('go', 453), ('get', 451), ('ur', 385), ('symbolnumb', 325), ('gt', 318), ('lt', 316), ('åmoney', 303)]


In [449]:
# TFIDF
tfidf_model=TfidfVectorizer()
tfidf_vec=tfidf_model.fit_transform(final_sms)
tfidf_data=pd.DataFrame(tfidf_vec.toarray())
tfidf_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,6504,6505,6506,6507,6508,6509,6510,6511,6512,6513
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [450]:
train_data.shape

(4457, 2)

In [451]:
test_data.shape

(1115, 1)

In [452]:
dftest = tfidf_data.iloc[-1115:]
dftrain = tfidf_data.iloc[:4457]

In [453]:
dftrain['labels'] = labels

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [455]:
X = dftrain.drop('labels',axis=1)
Y = labels

In [456]:
X_train,X_test,y_train,y_test = model_selection.train_test_split(X,Y,test_size=.2, random_state=42)

In [462]:
model = RandomForestClassifier(n_estimators=50,max_depth=12, random_state=101,
                             class_weight='balanced',verbose=1,n_jobs=-1)

In [463]:
model.fit(X_train,y_train)


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    0.8s finished


RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=12, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=50, n_jobs=-1, oob_score=False, random_state=101,
            verbose=1, warm_start=False)

In [464]:
y_pred_rfc = model.predict(X_test)


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    0.0s finished


In [465]:
y_pred_rfc 

array(['ham', 'spam', 'spam', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham',
       'ham', 'spam', 'ham', 'ham', 'ham', 'spam', 'ham', 'ham', 'ham',
       'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'spam', 'ham',
       'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham',
       'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'spam', 'ham',
       'ham', 'ham', 'ham', 'spam', 'ham', 'ham', 'ham', 'ham', 'ham',
       'ham', 'ham', 'spam', 'ham', 'spam', 'ham', 'ham', 'ham', 'ham',
       'ham', 'ham', 'ham', 'ham', 'ham', 'spam', 'spam', 'ham', 'ham',
       'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'spam',
       'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham',
       'ham', 'spam', 'ham', 'ham', 'ham', 'spam', 'ham', 'ham', 'ham',
       'spam', 'ham', 'ham', 'ham', 'ham', 'ham', 'spam', 'spam', 'ham',
       'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham',
       'ham', 'spam', 'ham', 'ham', 'spam', 'ham', 'ham', 'ham', 'ham',
 

In [466]:
print("F1 Score :",f1_score(y_pred_rfc,y_test,average = "weighted"))


F1 Score : 0.9835627398704002
