In [1]:
#Importing useful Libraries
import numpy as np
import pandas as pd

In [2]:
# Load the training and test data from local CSV files.
# NOTE: these are absolute, machine-specific paths; adjust them when running elsewhere.
train_csv_path = r"C:\Users\conne\Documents\Self Study\Datasets\train.csv"
test_csv_path = r"C:\Users\conne\Documents\Self Study\Datasets\test.csv"
train = pd.read_csv(train_csv_path)
test = pd.read_csv(test_csv_path)

In [3]:
# Preview the first five rows of the training data
train.head(5)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
# Preview the first five rows of the test data
test.head(n=5)

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


# Cleaning Training Data

In [5]:
# Count the missing values in each column
train.isna().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [6]:
# Number of distinct non-null values in the 'location' column
train['location'].nunique(dropna=True)

3341

In [7]:
# Drop the 'location' column: it has 3341 distinct values and 2533 missing entries
# (see the checks above) and is not used by the models below.
# errors='ignore' keeps this cell re-runnable; without it, running the cell a second
# time raises KeyError because the column is already gone.
train.drop(columns=['location'], inplace=True, errors='ignore')
train.head()

Unnamed: 0,id,keyword,text,target
0,1,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,Forest fire near La Ronge Sask. Canada,1
2,5,,All residents asked to 'shelter in place' are ...,1
3,6,,"13,000 people receive #wildfires evacuation or...",1
4,7,,Just got sent this photo from Ruby #Alaska as ...,1


In [8]:
# Re-check missing values now that 'location' is gone
train.isna().sum()

id          0
keyword    61
text        0
target      0
dtype: int64

In [9]:
# Drop every row that still contains a NaN. At this point only 'keyword' has
# missing values (61 rows), so this removes the rows without a keyword.
train.dropna(axis=0, how='any', inplace=True)

In [10]:
# Confirm that no missing values remain
train.isna().sum()

id         0
keyword    0
text       0
target     0
dtype: int64

In [11]:
# Class balance of the 'target' label
train.loc[:, 'target'].value_counts()

0    4323
1    3229
Name: target, dtype: int64

In [12]:
# Number of distinct keywords
train.loc[:, 'keyword'].nunique()

221

# Removing special characters and converting all text to lower case

In [25]:
import re

# Lower-case each tweet, collapse every run of characters outside a-z (digits,
# punctuation, whitespace, non-ASCII) into a single space, then trim both ends.
# NOTE: URLs are not removed first, so they survive as fragments such as "http t co ...".
non_letters = re.compile(r'[^a-z]+')
train['text'] = train['text'].map(lambda tweet: non_letters.sub(' ', tweet.lower()).strip())
train.head()

Unnamed: 0,id,keyword,text,target
31,48,ablaze,bbcmtd wholesale markets ablaze http t co lhyx...,1
32,49,ablaze,we always try to bring the heavy metal rt http...,0
33,50,ablaze,africanbaze breaking news nigeria flag set abl...,1
34,52,ablaze,crying out for more set me ablaze,0
35,53,ablaze,on plus side look at the sky last night it was...,0


In [26]:
# Inspect one cleaned tweet (row label 45)
train.loc[45, 'text']

'i gained followers in the last week you know your stats and grow with http t co tiyulif c'

# Defining features X and target y

In [27]:
# Model input is the cleaned tweet text; the label is the 'target' column
X, y = train['text'], train['target']

In [28]:
# NOTE: disabled scratch cell kept for reference. As written it could not run at this
# position anyway: X_train is only created by the train/test split in the next cell.
# from sklearn.feature_extraction.text import CountVectorizer
# cv = CountVectorizer(stop_words = 'english', max_features = 1000, ngram_range = (3,3))

# X_train_counts = cv.fit_transform(X_train)
# X_train_counts.shape

In [29]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=43,
)

# Logistic Regression with CountVectorizer

In [30]:
# CountVectorizer is imported here because no other active cell imports it
# (its only other import sits in a commented-out cell), so without this line
# the cell raises NameError on a fresh kernel.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# Features: counts of the 1000 most frequent word trigrams (English stop words
# removed), fed to a logistic regression with default settings.
text_clf = Pipeline([('cv', CountVectorizer(stop_words = 'english', max_features = 1000, ngram_range = (3,3))),
                     ('LReg', LogisticRegression()),
])

# Feed the training data through the pipeline
text_clf.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('cv', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=1000, min_df=1,
        ngram_range=(3, 3), preprocessor=None, stop_words='english',
        st...penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))])

In [31]:
from sklearn import metrics

# Predict labels for the held-out tweets
predictions = text_clf.predict(X_test)

# Confusion matrix of true labels versus predicted labels
print(metrics.confusion_matrix(y_true=y_test, y_pred=predictions))

[[1422   15]
 [ 833  223]]


In [32]:
# Per-class precision, recall and F1 for the CountVectorizer model
print(metrics.classification_report(y_true=y_test, y_pred=predictions))

              precision    recall  f1-score   support

           0       0.63      0.99      0.77      1437
           1       0.94      0.21      0.34      1056

   micro avg       0.66      0.66      0.66      2493
   macro avg       0.78      0.60      0.56      2493
weighted avg       0.76      0.66      0.59      2493



In [33]:
# Fraction of held-out tweets classified correctly
print(metrics.accuracy_score(y_true=y_test, y_pred=predictions))

0.6598475732049739


# Splitting data into test and train

In [None]:
from sklearn.model_selection import train_test_split

# Same call as the split made before the CountVectorizer model (same test_size and
# random_state), so it reproduces the same train/test partition.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.33,
    random_state=43,
)

# Logistic Regression Model with TfidfVectorizer

In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# TF-IDF features (default settings) followed by a default logistic regression
text_clf = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('LReg', LogisticRegression()),
])

# Fit the vectorizer and the classifier on the training tweets
text_clf.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))])

## Test the classifier and display results



In [35]:
from sklearn import metrics

# Predict labels for the held-out tweets with the TF-IDF model
predictions = text_clf.predict(X_test)

# Confusion matrix of true labels versus predicted labels
print(metrics.confusion_matrix(y_true=y_test, y_pred=predictions))

[[1248  189]
 [ 310  746]]


In [36]:
# Per-class precision, recall and F1 for the TF-IDF logistic regression
print(metrics.classification_report(y_true=y_test, y_pred=predictions))

              precision    recall  f1-score   support

           0       0.80      0.87      0.83      1437
           1       0.80      0.71      0.75      1056

   micro avg       0.80      0.80      0.80      2493
   macro avg       0.80      0.79      0.79      2493
weighted avg       0.80      0.80      0.80      2493



In [37]:
# Fraction of held-out tweets classified correctly
print(metrics.accuracy_score(y_true=y_test, y_pred=predictions))

0.7998395507420778


# Linear SVC Model

In [40]:
# Pipeline and TfidfVectorizer are imported here as well, so this cell does not
# depend on the logistic-regression cell having been run first.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

# TF-IDF features (default settings) followed by a linear support vector classifier.
# NOTE(review): LinearSVC is left unseeded (random_state=None); consider fixing a
# seed if exact run-to-run reproducibility matters.
text_clf = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', LinearSVC()),
])

# Feed the training data through the pipeline
text_clf.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

In [41]:
from sklearn import metrics

# Predict labels for the held-out tweets with the LinearSVC model
predictions = text_clf.predict(X_test)

# Confusion matrix of true labels versus predicted labels
print(metrics.confusion_matrix(y_true=y_test, y_pred=predictions))

[[1189  248]
 [ 272  784]]


In [42]:
# Per-class precision, recall and F1 for the LinearSVC model
print(metrics.classification_report(y_true=y_test, y_pred=predictions))

              precision    recall  f1-score   support

           0       0.81      0.83      0.82      1437
           1       0.76      0.74      0.75      1056

   micro avg       0.79      0.79      0.79      2493
   macro avg       0.79      0.78      0.79      2493
weighted avg       0.79      0.79      0.79      2493



In [43]:
# Fraction of held-out tweets classified correctly
print(metrics.accuracy_score(y_true=y_test, y_pred=predictions))

0.7914159647011633
