In [3]:
import pandas as pd
import matplotlib.pyplot as plt

In [4]:
train = pd.read_csv('train_2kmZucJ.csv')
test = pd.read_csv('test_oJQbWVk.csv')
submission = pd.read_csv('sample_submission_LnhVWA4.csv')

In [5]:
train.head()

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,2,0,Finally a transparant silicon case ^^ Thanks t...
2,3,0,We love this! Would you go? #talk #makememorie...
3,4,0,I'm wired I know I'm George I was made that wa...
4,5,1,What amazing service! Apple won't even talk to...


In [6]:
train['tweet'][4]

"What amazing service! Apple won't even talk to me about a question I have unless I pay them $19.95 for their stupid support!"

### Making checkpoints 

In [7]:
df_train = train.copy()
df_test = test.copy()

In [8]:
df_train.head()

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,2,0,Finally a transparant silicon case ^^ Thanks t...
2,3,0,We love this! Would you go? #talk #makememorie...
3,4,0,I'm wired I know I'm George I was made that wa...
4,5,1,What amazing service! Apple won't even talk to...


In [9]:
df_train.drop(columns='id',axis=1,inplace=True)
df_test.drop(columns='id',axis=1,inplace=True)

In [10]:
df_train.head()

Unnamed: 0,label,tweet
0,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,0,Finally a transparant silicon case ^^ Thanks t...
2,0,We love this! Would you go? #talk #makememorie...
3,0,I'm wired I know I'm George I was made that wa...
4,1,What amazing service! Apple won't even talk to...


###  Data Preprocessing with Spacy

In [11]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

In [12]:
nlp = spacy.load('en_core_web_sm')

In [13]:
stopwords = list(STOP_WORDS)

In [14]:
len(stopwords)

326

### getting lemma and stop words

In [20]:
# use the pucntuations of string module
import string
punctuations = string.punctuation

In [21]:
punctuations

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [22]:
# creating a Spacy Parser
from spacy.lang.en import English
parser = English()

In [23]:
def spacy_tokenizer(sentence):
    mytokens = parser(sentence)
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]
    mytokens = [ word for word in mytokens if word not in stopwords and word not in punctuations ]
    return mytokens

In [24]:
h = "this is th e@himansu tripathi and @#love #you"

In [25]:
print(spacy_tokenizer(h))

['th', 'e@himansu', 'tripathi', '@#love']


In [26]:
!pip install xgboost



In [27]:
# ML Packages
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics import accuracy_score 
from sklearn.base import TransformerMixin 
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier

In [28]:
#Custom transformer using spaCy 
class predictors(TransformerMixin):
    def transform(self, X, **transform_params):
        return [clean_text(text) for text in X]
    def fit(self, X, y=None, **fit_params):
        return self
    def get_params(self, deep=True):
        return {}

# Basic function to clean the text 
def clean_text(text):     
    return text.strip().lower()

In [29]:
# Vectorization
vectorizer = CountVectorizer(tokenizer = spacy_tokenizer, 
                             ngram_range=(1,1)) 
# classifier = LinearSVC()
classifier = XGBClassifier()

In [30]:
# Using Tfidf
tfvectorizer = TfidfVectorizer(tokenizer = spacy_tokenizer)

In [31]:
# Splitting Data Set
from sklearn.model_selection import train_test_split

In [32]:
df_train.head()

Unnamed: 0,label,tweet
0,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,0,Finally a transparant silicon case ^^ Thanks t...
2,0,We love this! Would you go? #talk #makememorie...
3,0,I'm wired I know I'm George I was made that wa...
4,1,What amazing service! Apple won't even talk to...


In [33]:
X = df_train['tweet']
ylabels = df_train['label']

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X, ylabels, test_size=0.2, random_state=42)

In [35]:
# Create the  pipeline to clean, tokenize, vectorize, and classify 
pipe = Pipeline([("cleaner", predictors()),
                 ('vectorizer', vectorizer),
                 ('classifier', classifier)])

In [36]:
# Fit our data
pipe.fit(X_train,y_train)

Pipeline(memory=None,
     steps=[('cleaner', <__main__.predictors object at 0x000002738868B108>), ('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
      ...ale_pos_weight=1, subsample=1, tree_method=None,
       validate_parameters=False, verbosity=None))])

In [37]:
# Predicting with a test dataset
sample_prediction = pipe.predict(X_test)

In [38]:
print("Accuracy: ",pipe.score(X_test,y_test))
print("Accuracy: ",pipe.score(X_test,sample_prediction))

Accuracy:  0.8768939393939394
Accuracy:  1.0


In [39]:
# Accuracy
print("Accuracy: ",pipe.score(X_train,y_train))

Accuracy:  0.915719696969697


In [40]:
from sklearn.metrics import f1_score

In [41]:
pipe.predict(["What amazing service!",
              " Apple won't even talk to me about a question I have unless I pay them $19.95 for their stupid support!",
             "this is really sex palce"])

array([0, 1, 1], dtype=int64)

In [42]:
pred = pipe.predict(df_test['tweet'])

In [43]:
len(pred), len(submission['label'])

(1953, 1953)

In [44]:
pred

array([1, 1, 1, ..., 1, 1, 0], dtype=int64)

In [45]:
ids = test['id']

In [46]:
sub = pd.DataFrame({
    'id':ids,
    'label':pred
})

In [47]:
sub.label.value_counts()

0    1487
1     466
Name: label, dtype: int64

In [48]:
sub.to_csv('submission.csv',index=False)

In [49]:
d = pd.read_csv('submission.csv')
d.head()

Unnamed: 0,id,label
0,7921,1
1,7922,1
2,7923,1
3,7924,1
4,7925,1
