In [52]:
import pandas as pd
import matplotlib.pyplot as plt

In [63]:
train = pd.read_csv('train_2kmZucJ.csv')
test = pd.read_csv('test_oJQbWVk.csv')
submission = pd.read_csv('sample_submission_LnhVWA4.csv')

In [64]:
train.head()

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,2,0,Finally a transparant silicon case ^^ Thanks t...
2,3,0,We love this! Would you go? #talk #makememorie...
3,4,0,I'm wired I know I'm George I was made that wa...
4,5,1,What amazing service! Apple won't even talk to...


In [87]:
train['tweet'][4]

"What amazing service! Apple won't even talk to me about a question I have unless I pay them $19.95 for their stupid support!"

### Making checkpoints 

In [65]:
df_train = train.copy()
df_test = test.copy()

In [66]:
df_train.head()

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,2,0,Finally a transparant silicon case ^^ Thanks t...
2,3,0,We love this! Would you go? #talk #makememorie...
3,4,0,I'm wired I know I'm George I was made that wa...
4,5,1,What amazing service! Apple won't even talk to...


In [67]:
df_train.drop(columns='id',axis=1,inplace=True)
df_test.drop(columns='id',axis=1,inplace=True)

In [68]:
df_train.head()

Unnamed: 0,label,tweet
0,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,0,Finally a transparant silicon case ^^ Thanks t...
2,0,We love this! Would you go? #talk #makememorie...
3,0,I'm wired I know I'm George I was made that wa...
4,1,What amazing service! Apple won't even talk to...


###  Data Preprocessing with Spacy

In [8]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

In [11]:
nlp = spacy.load('en_core_web_sm')

In [12]:
stopwords = list(STOP_WORDS)

In [14]:
len(stopwords)

326

### getting lemma and stop words

In [16]:
docx = nlp("This is how John Walker was walking. He was also running beside the lawn.")

In [17]:
# lemmatizing of tokes
for word in docx:
    print(word.text, "Lemma =>" , word.lemma_)

This Lemma => this
is Lemma => be
how Lemma => how
John Lemma => John
Walker Lemma => Walker
was Lemma => be
walking Lemma => walk
. Lemma => .
He Lemma => -PRON-
was Lemma => be
also Lemma => also
running Lemma => run
beside Lemma => beside
the Lemma => the
lawn Lemma => lawn
. Lemma => .


In [18]:
# lemma that are not pronouns
for word in docx:
    if word.lemma_ != "-PRON-":
        print(word.lemma_.lower().strip())

this
be
how
john
walker
be
walk
.
be
also
run
beside
the
lawn
.


In [19]:
# list comprehension of our lemma
[word.lemma_.lower().strip() if word.lemma_ != '-PRON-' else word.lower_ for word in docx]

['this',
 'be',
 'how',
 'john',
 'walker',
 'be',
 'walk',
 '.',
 'he',
 'be',
 'also',
 'run',
 'beside',
 'the',
 'lawn',
 '.']

In [21]:
# filtering out stopwords and punctuations
for word in docx:
    if word.is_stop == False and not word.is_punct:
        print(word)

John
Walker
walking
running
lawn


In [23]:
# use the pucntuations of string module
import string
punctuations = string.punctuation

In [24]:
punctuations

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [27]:
# creating a Spacy Parser
from spacy.lang.en import English
parser = English()

In [37]:
def spacy_tokenizer(sentence):
    mytokens = parser(sentence)
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]
    mytokens = [ word for word in mytokens if word not in stopwords and word not in punctuations ]
    return mytokens

In [40]:
h = "this is th e@himansu tripathi and @#love #you"

In [41]:
print(spacy_tokenizer(h))

['th', 'e@himansu', 'tripathi', '@#love']


In [42]:
# ML Packages
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics import accuracy_score 
from sklearn.base import TransformerMixin 
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

In [43]:
#Custom transformer using spaCy 
class predictors(TransformerMixin):
    def transform(self, X, **transform_params):
        return [clean_text(text) for text in X]
    def fit(self, X, y=None, **fit_params):
        return self
    def get_params(self, deep=True):
        return {}

# Basic function to clean the text 
def clean_text(text):     
    return text.strip().lower()

In [44]:
# Vectorization
vectorizer = CountVectorizer(tokenizer = spacy_tokenizer, 
                             ngram_range=(1,1)) 
classifier = LinearSVC()

In [45]:
# Using Tfidf
tfvectorizer = TfidfVectorizer(tokenizer = spacy_tokenizer)

In [46]:
# Splitting Data Set
from sklearn.model_selection import train_test_split

In [69]:
df_train.head()

Unnamed: 0,label,tweet
0,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,0,Finally a transparant silicon case ^^ Thanks t...
2,0,We love this! Would you go? #talk #makememorie...
3,0,I'm wired I know I'm George I was made that wa...
4,1,What amazing service! Apple won't even talk to...


In [72]:
X = df_train['tweet']
ylabels = df_train['label']

In [73]:
X_train, X_test, y_train, y_test = train_test_split(X, ylabels, test_size=0.2, random_state=42)

In [74]:
# Create the  pipeline to clean, tokenize, vectorize, and classify 
pipe = Pipeline([("cleaner", predictors()),
                 ('vectorizer', vectorizer),
                 ('classifier', classifier)])

In [75]:
# Fit our data
pipe.fit(X_train,y_train)

Pipeline(memory=None,
     steps=[('cleaner', <__main__.predictors object at 0x000001B2497A3E48>), ('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
      ...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

In [76]:
# Predicting with a test dataset
sample_prediction = pipe.predict(X_test)

In [80]:
print("Accuracy: ",pipe.score(X_test,y_test))
print("Accuracy: ",pipe.score(X_test,sample_prediction))

Accuracy:  0.8598484848484849
Accuracy:  1.0


In [81]:
# Accuracy
print("Accuracy: ",pipe.score(X_train,y_train))

Accuracy:  0.9966856060606061


In [111]:
from sklearn.metrics import f1_score

In [92]:
pipe.predict(["What amazing service!",
              " Apple won't even talk to me about a question I have unless I pay them $19.95 for their stupid support!",
             "this is really sex palce"])

array([0, 1, 1], dtype=int64)

In [96]:
pred = pipe.predict(df_test['tweet'])

In [99]:
len(pred), len(submission['label'])

(1953, 1953)

In [100]:
pred

array([1, 1, 1, ..., 0, 1, 0], dtype=int64)

In [102]:
ids = test['id']

In [103]:
sub = pd.DataFrame({
    'id':ids,
    'label':pred
})

In [108]:
sub.label.value_counts()

0    1441
1     512
Name: label, dtype: int64

In [109]:
sub.to_csv('submission.csv',index=False)

In [110]:
d = pd.read_csv('submission.csv')
d.head()

Unnamed: 0,id,label
0,7921,1
1,7922,1
2,7923,1
3,7924,1
4,7925,1
