In [2]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [4]:

# The stop-word corpus is not bundled with nltk itself; fetch it first (this is
# a no-op when it is already installed) so the cell also runs on a fresh
# environment instead of raising LookupError.
nltk.download('stopwords', quiet=True)
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [5]:
# Column labels are passed via names= (no header= argument), so pandas treats
# every row of the file as data.
column_names = [
    'target',
    'id',
    'date',
    'flag',
    'user',
    'text',
]
df = pd.read_csv(
    'twitter.csv',
    names=column_names,
    encoding='ISO-8859-1',
)
df.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [None]:
# (rows, columns) of the loaded frame.
df.shape

# ## Pre-processing the data

In [54]:
# Number of missing values in each column (isna is the same check as isnull).
df.isna().sum()

target    0
id        0
date      0
flag      0
user      0
text      0
dtype: int64

In [9]:
# Class balance: number of rows for each distinct value of `target`.
df['target'].value_counts()

target
0    800000
4    800000
Name: count, dtype: int64

Replace the label 4 in `target` with 1, so that the two classes are labelled 0 and 1.

In [60]:

# Map label 4 -> 1 in the target column only, assigning the result back
# explicitly rather than mutating through inplace=True.
df['target'] = df['target'].replace({4: 1})

In [15]:
# Re-check the label distribution. Once the 4 -> 1 replacement above has run,
# only the labels 0 and 1 should be listed; if 4 still appears, the cells were
# executed out of order.
df['target'].value_counts()

target
0    800000
4    800000
Name: count, dtype: int64

Stemming is the process of reducing a word to its root form.
e.g. "feels" becomes "feel", "dived" becomes "dive" and "times" becomes "time": different forms of a word carry the same meaning, so we keep only the root.

In [7]:
# Single shared stemmer instance, used by stemming() below.
port_stem = PorterStemmer()

In [8]:


_ENGLISH_STOP_WORDS = None  # filled lazily on the first call to stemming()


def stemming(content):
    """Tokenise, lower-case, stop-word-filter and stem one piece of text.

    Every character that is not an ASCII letter is replaced by a space, the
    result is lower-cased and split on whitespace, English stop words are
    dropped, and each remaining token is passed through ``port_stem``.

    Parameters
    ----------
    content : str
        Raw text of a single tweet.

    Returns
    -------
    list of str
        Stemmed tokens, in their original order.
    """
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        # stopwords.words() builds a fresh list on every call, and the
        # original called it (plus a linear list search) once per token.
        # Load it a single time and keep it as a set for O(1) lookups.
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))

    letters_only = re.sub('[^a-zA-Z]', ' ', content)
    tokens = letters_only.lower().split()
    return [port_stem.stem(word) for word in tokens if word not in _ENGLISH_STOP_WORDS]

# Run stemming() on every tweet (one Python call per row); each cell of the
# new column holds that tweet's list of stemmed tokens.
df['stemmed_content'] = df['text'].apply(stemming)
df.head()

Unnamed: 0,target,id,date,flag,user,text,stemmed_content
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","[switchfoot, http, twitpic, com, zl, awww, bum..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,"[upset, updat, facebook, text, might, cri, res..."
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,"[kenichan, dive, mani, time, ball, manag, save..."
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,"[whole, bodi, feel, itchi, like, fire]"
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....","[nationwideclass, behav, mad, see]"


In [9]:
# Print the stemmed token lists for the whole column.
print(df["stemmed_content"])

0          [switchfoot, http, twitpic, com, zl, awww, bum...
1          [upset, updat, facebook, text, might, cri, res...
2          [kenichan, dive, mani, time, ball, manag, save...
3                     [whole, bodi, feel, itchi, like, fire]
4                         [nationwideclass, behav, mad, see]
                                 ...                        
1599995                     [woke, school, best, feel, ever]
1599996    [thewdb, com, cool, hear, old, walt, interview...
1599997                   [readi, mojo, makeov, ask, detail]
1599998    [happi, th, birthday, boo, alll, time, tupac, ...
1599999    [happi, charitytuesday, thenspcc, sparkschar, ...
Name: stemmed_content, Length: 1600000, dtype: object


In [12]:
# Features are the stemmed token lists; labels are the target column.
x, y = df['stemmed_content'], df['target']

In [13]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class ratio
# the same in both parts, and the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=2,
)
print(x.shape, x_train.shape, x_test.shape)

(1600000,) (1280000,) (320000,)


Convert the text into numerical features, because the model cannot work on raw text.
The TF-IDF vectorizer assigns every word a numeric weight that reflects how important the word is.
The weight grows with how often the word occurs in a tweet and shrinks for words that appear in many tweets.

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Each document is currently a list of tokens; glue every list back into one
# space-separated string before handing it to the vectorizer.
# NOTE: x_train / x_test are rebound in this cell, so re-run the split cell
# before re-running this one.
x_train = list(map(' '.join, x_train))
x_test = list(map(' '.join, x_test))

vectorizer = TfidfVectorizer()

# Fit on the training documents only and transform them ...
x_train = vectorizer.fit_transform(x_train)

# ... then transform the held-out documents with that same fitted vectorizer.
x_test = vectorizer.transform(x_test)

In [20]:
# Print the vectorised train and test features, shown as
# (row, column) -> weight entries.
print(x_train)
print(x_test)

  (0, 443066)	0.4484755317023172
  (0, 235045)	0.41996827700291095
  (0, 109306)	0.3753708587402299
  (0, 185193)	0.5277679060576009
  (0, 354543)	0.3588091611460021
  (0, 436713)	0.27259876264838384
  (1, 160636)	1.0
  (2, 288470)	0.16786949597862733
  (2, 132311)	0.2028971570399794
  (2, 150715)	0.18803850583207948
  (2, 178061)	0.1619010109445149
  (2, 409143)	0.15169282335109835
  (2, 266729)	0.24123230668976975
  (2, 443430)	0.3348599670252845
  (2, 77929)	0.31284080750346344
  (2, 433560)	0.3296595898028565
  (2, 406399)	0.32105459490875526
  (2, 129411)	0.29074192727957143
  (2, 407301)	0.18709338684973031
  (2, 124484)	0.1892155960801415
  (2, 109306)	0.4591176413728317
  (3, 172421)	0.37464146922154384
  (3, 411528)	0.27089772444087873
  (3, 388626)	0.3940776331458846
  (3, 56476)	0.5200465453608686
  :	:
  (1279996, 390130)	0.22064742191076112
  (1279996, 434014)	0.2718945052332447
  (1279996, 318303)	0.21254698865277746
  (1279996, 237899)	0.2236567560099234
  (1279996, 2910

In [21]:
# Logistic-regression classifier; max_iter caps the solver at 1000 iterations.
model=LogisticRegression(max_iter=1000)
# Train on the vectorised training features and their labels.
model.fit(x_train, y_train)

In [24]:
# Accuracy on the same rows the model was fitted on.
x_train_prediction = model.predict(x_train)
training_accuracy = accuracy_score(
    y_true=y_train,
    y_pred=x_train_prediction,
)
print('training data accuracy', training_accuracy)

training data accuracy 0.8101671875


In [25]:
# Accuracy on the held-out rows the model did not see during fitting.
x_test_prediction = model.predict(x_test)
testing_accuracy = accuracy_score(
    y_true=y_test,
    y_pred=x_test_prediction,
)
print('testing data accuracy', testing_accuracy)

testing data accuracy 0.77800625


In [26]:
import pickle

filename = 'trained_model.sav'
# Write through a context manager so the file handle is flushed and closed
# even if pickling fails; a bare open() inside the dump call is never closed
# explicitly.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)
# NOTE(review): only the classifier is saved here. Predicting on new text also
# needs the fitted `vectorizer`; consider persisting it alongside the model.