In [20]:
import pandas as pd
import numpy as np
import re

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

In [21]:
train_df = pd.read_csv("train_2kmZucJ.csv")
train_df.head()

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,2,0,Finally a transparant silicon case ^^ Thanks t...
2,3,0,We love this! Would you go? #talk #makememorie...
3,4,0,I'm wired I know I'm George I was made that wa...
4,5,1,What amazing service! Apple won't even talk to...


In [22]:
test_df = pd.read_csv("test_oJQbWVk.csv")
test_df.head()

Unnamed: 0,id,tweet
0,7921,I hate the new #iphone upgrade. Won't let me d...
1,7922,currently shitting my fucking pants. #apple #i...
2,7923,"I'd like to puts some CD-ROMS on my iPad, is t..."
3,7924,My ipod is officially dead. I lost all my pict...
4,7925,Been fighting iTunes all night! I only want th...


In [23]:
# Removing words starts-with @
for index, tweet in enumerate(train_df['tweet']):
    r = re.findall("@[\w]*", tweet)
    for i in r:
        tweet = re.sub(i, '', tweet)
    
    train_df['tweet'][index] = tweet

# Removing special characters
train_df['tweet'] = train_df['tweet'].str.replace("[^a-zA-Z#]", " ")
train_df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https goo gl h ...
1,2,0,Finally a transparant silicon case Thanks t...
2,3,0,We love this Would you go #talk #makememorie...
3,4,0,I m wired I know I m George I was made that wa...
4,5,1,What amazing service Apple won t even talk to...


In [24]:
# Removing short words
train_df['tweet'] = train_df['tweet'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))

train_df['tweet'].head()

0    #fingerprint #Pregnancy Test https MfQV #andro...
1    Finally transparant silicon case Thanks uncle ...
2    love this Would #talk #makememories #unplug #r...
3    wired know George made that #iphone #cute #dav...
4    What amazing service Apple even talk about que...
Name: tweet, dtype: object

# For test data

In [25]:
for index, tweet in enumerate(test_df['tweet']):
    r = re.findall("@[\w]*", tweet)
    for i in r:
        tweet = re.sub(i, '', tweet)
    
    test_df['tweet'][index] = tweet

test_df['tweet'] = test_df['tweet'].str.replace("[^a-zA-Z#]", " ")
test_df['tweet'] = test_df['tweet'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [26]:
from sklearn.feature_extraction.text import CountVectorizer
bow_vectorizer = CountVectorizer(max_df=0.90, min_df=2, max_features=1000, stop_words='english')
# bag-of-words feature matrix
train_bow = bow_vectorizer.fit_transform(train_df['tweet'])
test_bow = bow_vectorizer.fit_transform(test_df['tweet'])

In [27]:
# splitting data into training and validation set
xtrain_bow, xvalid_bow, ytrain, yvalid = train_test_split(train_bow, train_df['label'], random_state=42, test_size=0.3)

lreg = LogisticRegression()
lreg.fit(xtrain_bow, ytrain) # training the model

prediction = lreg.predict_proba(xvalid_bow) # predicting on the validation set
prediction_int = prediction[:,1] >= 0.3 # if prediction is greater than or equal to 0.3 than 1 else 0
prediction_int = prediction_int.astype(np.int)

f1_score(yvalid, prediction_int) # calculating f1 score



0.797776233495483

In [28]:
test_pred = lreg.predict_proba(test_bow)
test_pred_int = test_pred[:,1] >= 0.3
test_pred_int = test_pred_int.astype(np.int)
test_df['label'] = test_pred_int
test_df.head()

Unnamed: 0,id,tweet,label
0,7921,hate #iphone upgrade download apps #ugh #apple...,1
1,7922,currently shitting fucking pants #apple #iMac ...,1
2,7923,like puts some ROMS iPad that possible wouldn ...,1
3,7924,ipod officially dead lost pictures videos from...,1
4,7925,Been fighting iTunes night only want music paid,1


In [29]:
submission = test_df[['id','label']]
submission.to_csv('submission.csv', index=False) # writing data to a CSV file