In [17]:
import pandas as pd
import numpy as np

In [18]:
data = pd.read_csv("train.csv")

In [27]:
data.isna().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

### Remove row where we don't have data

In [28]:
data = data[~data['text'].isna()][['text','label']]

In [29]:
data.head()

Unnamed: 0,text,label
0,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,Ever get the feeling your life circles the rou...,0
2,"Why the Truth Might Get You Fired October 29, ...",1
3,Videos 15 Civilians Killed In Single US Airstr...,1
4,Print \nAn Iranian woman has been sentenced to...,1


In [31]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20761 entries, 0 to 20799
Data columns (total 2 columns):
text     20761 non-null object
label    20761 non-null int64
dtypes: int64(1), object(1)
memory usage: 486.6+ KB


### Remove punctuation

In [32]:
import re
def remove_punctuation(text):
    text = re.sub("[^a-zA-Z]", " ", text) # removing punctuation
    return text

In [33]:
data['text'] = data['text'].apply(remove_punctuation)

In [34]:
text = data['text']
label = data['label']

In [35]:
text

0        House Dem Aide  We Didn t Even See Comey s Let...
1        Ever get the feeling your life circles the rou...
2        Why the Truth Might Get You Fired October     ...
3        Videos    Civilians Killed In Single US Airstr...
4        Print  An Iranian woman has been sentenced to ...
                               ...                        
20795    Rapper T  I  unloaded on black celebrities who...
20796    When the Green Bay Packers lost to the Washing...
20797    The Macy s of today grew from the union of sev...
20798    NATO  Russia To Hold Parallel Exercises In Bal...
20799      David Swanson is an author  activist  journa...
Name: text, Length: 20761, dtype: object

### Word Embeddings

In [88]:
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer

In [37]:
corpus = list(text)
tfidf = TfidfVectorizer(max_features = 6000) 
tfidf.fit(corpus)
tfidf_features = tfidf.transform(corpus)

In [38]:
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import LinearSVC
from sklearn.metrics import precision_recall_fscore_support
# Converting the labels from strings to binary
le = LabelEncoder()
le.fit(label)
label = le.transform(label)

In [39]:
# Taking 70/30 train test split
train_percent = 0.7
train_cutoff = int(np.floor(train_percent*len(text) ) )

# TF-IDF table
tfidf_model = LinearSVC()
tfidf_model.fit(tfidf_features[0 : train_cutoff], 
                  label[0 : train_cutoff])


In [74]:
test = pd.read_csv("test.csv")
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5200 entries, 0 to 5199
Data columns (total 4 columns):
id        5200 non-null int64
title     5078 non-null object
author    4697 non-null object
text      5193 non-null object
dtypes: int64(1), object(3)
memory usage: 162.6+ KB


In [76]:
#test = test[~test['text'].isna()][['text','id']]
test['text'] = test['text'].fillna(' ')
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5200 entries, 0 to 5199
Data columns (total 4 columns):
id        5200 non-null int64
title     5078 non-null object
author    4697 non-null object
text      5200 non-null object
dtypes: int64(1), object(3)
memory usage: 162.6+ KB


In [77]:
test['text'] = test['text'].apply(remove_punctuation)

In [78]:
test_text = test['text']
id_= test['id']

In [79]:
corpus_test = list(test_text)
tfidf.fit(corpus_test)
tfidf_features_test = tfidf.transform(corpus_test)

In [82]:
tfidf_prediction = tfidf_model.predict(
                  tfidf_features_test)

### Actual Test Prediction

In [83]:
len(tfidf_prediction)

5200

In [63]:
# TF-IDF table
tfidf_model = LinearSVC()
tfidf_model.fit(tfidf_features[0 : train_cutoff], 
                  label[0 : train_cutoff])
tfidf_prediction = tfidf_model.predict(
                  tfidf_features[train_cutoff + 1 : len(text)])

In [64]:
results = pd.DataFrame(index = ['TF-IDF'], 
          columns = ['Precision', 'Recall', 'F1 score', 'support']
          )
results.loc['TF-IDF'] = precision_recall_fscore_support(
          label[train_cutoff + 1 : len(text)], 
          tfidf_prediction, 
          average = 'binary'
          )

### Result for the test 30 percent split

In [65]:
results

Unnamed: 0,Precision,Recall,F1 score,support
TF-IDF,0.954955,0.952809,0.953881,


### Submission 

In [68]:
sub = pd.read_csv('submit.csv')

In [71]:
sub.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5200 entries, 0 to 5199
Data columns (total 2 columns):
id       5200 non-null int64
label    5200 non-null int64
dtypes: int64(2)
memory usage: 81.4 KB


In [72]:
sub_new = pd.DataFrame()

In [84]:
sub_new['id']=id_
sub_new['label']=tfidf_prediction

### Kaggle Score 0.59 TFIDF

tfidf_prediction

In [85]:
sub_new

Unnamed: 0,id,label
0,20800,1
1,20801,1
2,20802,1
3,20803,1
4,20804,1
...,...,...
5195,25995,0
5196,25996,0
5197,25997,0
5198,25998,1


In [87]:
sub_new.to_csv("Submit_new.csv",index=False)

### Count Vectorizer

In [95]:
len(test_text)

5200

In [102]:
# create a count vectorizer object 
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
corpus_vect = list(test_text)
count_vect.fit(corpus_vect)
tfidf_features_vect = tfidf.transform(corpus_vect)


In [90]:
tfidf_model.fit(tfidf_features_vect[0 : train_cutoff], 
                  label[0 : train_cutoff])
tfidf_prediction_vect = tfidf_model.predict(
                  tfidf_features_vect[train_cutoff + 1 : len(text)])

In [103]:
tfidf_prediction_vect = tfidf_model.predict(
                  tfidf_features_vect)

In [104]:
len(tfidf_prediction_vect)

5200

In [105]:
sub_new = pd.DataFrame()

In [106]:
sub_new['id']=id_
sub_new['label']=tfidf_prediction_vect

In [107]:
sub_new.to_csv("Submit_new_1.csv",index=False)

### 0.95 Score on Kaggle Count Vectorizer

tfidf_prediction_vect