In [1]:
#Import Libraries
import numpy as np
import pandas as pd
import re 
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
#download stop words
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
#print stopwords
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [4]:
#Data processing

data = pd.read_csv("twitter_data.csv",encoding = "ISO-8859-1")

In [5]:
#length of data
len(data)

1599999

In [6]:
#shape of data
data.shape

(1599999, 6)

In [7]:
#print first 5 rows sample
data.head()

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [8]:
#fixing the row headers and columns
column_names = ['target','id','date','flag','user','text']

In [9]:
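#re-read the file with explicit column names: it has no header row, so the first read used the first tweet as the header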
data = pd.read_csv("twitter_data.csv",names=column_names,encoding = "ISO-8859-1")

In [10]:
#show the first 5 rows
data.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [11]:
#shape of data
data.shape

(1600000, 6)

In [12]:
#handle missing values -> 1)Counting missing values in dataset
#2)Handle that
data.isnull().sum()

target    0
id        0
date      0
flag      0
user      0
text      0
dtype: int64

In [13]:
#checking labels distribution
data['target'].value_counts()
#if the classes were not balanced, some resampling (over-/under-sampling) would be needed

target
0    800000
4    800000
Name: count, dtype: int64

In [14]:
#map the positive label 4 to 1 so the targets become 0 / 1
data['target'] = data['target'].replace({4: 1})

In [15]:
#checking again labels distribution
data['target'].value_counts()

target
0    800000
1    800000
Name: count, dtype: int64

In [16]:
# 0->negative Tweet
# 1->positive Tweet

In [17]:
#Stemming
#Stemming reduces inflected words to a common root form, e.g. {acting, acted, acts...} -> act
port_stem = PorterStemmer()

In [18]:
#function handling stemming
def stemmer(content):
    """Clean a raw tweet and return its stemmed, stop-word-free form.

    Every character that is not an ASCII letter is replaced by a space, the
    text is lower-cased and split on whitespace, English stop words are
    dropped and each remaining word is stemmed with the notebook-level
    ``port_stem``.

    Parameters
    ----------
    content : str
        Raw tweet text.

    Returns
    -------
    str
        The stems joined by single spaces (an empty string when no word
        survives the filtering).
    """
    # Calling stopwords.words() inside the comprehension rebuilt the list for
    # every single word and then scanned it linearly. Load it once, keep it
    # as a set on the function object, and reuse it on later calls.
    stop_words = getattr(stemmer, "_stop_words", None)
    if stop_words is None:
        stop_words = frozenset(stopwords.words("english"))
        stemmer._stop_words = stop_words

    words = re.sub('[^a-zA-Z]', ' ', content).lower().split()
    stems = [port_stem.stem(word) for word in words if word not in stop_words]
    return ' '.join(stems)

In [19]:
#create new column of stemmed Content
data['stemmed_content'] = data['text'].apply(stemmer)

In [20]:
#print the data
data.head()

Unnamed: 0,target,id,date,flag,user,text,stemmed_content
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",switchfoot http twitpic com zl awww bummer sho...
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,upset updat facebook text might cri result sch...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,kenichan dive mani time ball manag save rest g...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,whole bodi feel itchi like fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",nationwideclass behav mad see


In [21]:
#necessary data
data["stemmed_content"]

0          switchfoot http twitpic com zl awww bummer sho...
1          upset updat facebook text might cri result sch...
2          kenichan dive mani time ball manag save rest g...
3                            whole bodi feel itchi like fire
4                              nationwideclass behav mad see
                                 ...                        
1599995                           woke school best feel ever
1599996    thewdb com cool hear old walt interview http b...
1599997                         readi mojo makeov ask detail
1599998    happi th birthday boo alll time tupac amaru sh...
1599999    happi charitytuesday thenspcc sparkschar speak...
Name: stemmed_content, Length: 1600000, dtype: object

In [22]:
#necessary data
data["target"]

0          0
1          0
2          0
3          0
4          0
          ..
1599995    1
1599996    1
1599997    1
1599998    1
1599999    1
Name: target, Length: 1600000, dtype: int64

In [23]:
#separating the targets and the stemmed content
X = data["stemmed_content"].values
Y = data["target"].values

In [24]:
print(X)

['switchfoot http twitpic com zl awww bummer shoulda got david carr third day'
 'upset updat facebook text might cri result school today also blah'
 'kenichan dive mani time ball manag save rest go bound' ...
 'readi mojo makeov ask detail'
 'happi th birthday boo alll time tupac amaru shakur'
 'happi charitytuesday thenspcc sparkschar speakinguph h']


In [25]:
print(Y)

[0 0 0 ... 1 1 1]


In [26]:
#split the data into training (80%) and testing (20%) sets
#stratify=Y keeps the label proportions the same in both sets
#random_state seeds the shuffle, so the same split is reproduced on every run
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=2,
    stratify=Y,
)

In [27]:
#shapes of test train
print(X.shape,X_train.shape,X_test.shape)

(1600000,) (1280000,) (320000,)


In [28]:
print(X_test)

['mmangen fine much time chat twitter hubbi back summer amp tend domin free time'
 'ah may show w ruth kim amp geoffrey sanhueza'
 'ishatara mayb bay area thang dammit' ...
 'destini nevertheless hooray member wonder safe trip' 'feel well'
 'supersandro thank']


In [29]:
#converting textual data to numerical data (TF-IDF features)
vectorizer = TfidfVectorizer()

X_train = vectorizer.fit_transform(X_train)

In [30]:
import pickle
filename = "tfidf_vectorizer.pkl"
#save the fitted vectorizer; the context manager guarantees the file is
#flushed and closed as soon as the dump finishes
with open(filename,'wb') as vectorizer_file:
    pickle.dump(vectorizer,vectorizer_file)

In [31]:
#converting text to numerical in X_test
X_test = vectorizer.transform(X_test)

In [32]:
print(X_test)

  (0, 420984)	0.17915624523539803
  (0, 409143)	0.31430470598079707
  (0, 398906)	0.3491043873264267
  (0, 388348)	0.21985076072061738
  (0, 279082)	0.1782518010910344
  (0, 271016)	0.4535662391658828
  (0, 171378)	0.2805816206356073
  (0, 138164)	0.23688292264071403
  (0, 132364)	0.25525488955578596
  (0, 106069)	0.3655545001090455
  (0, 67828)	0.26800375270827315
  (0, 31168)	0.16247724180521766
  (0, 15110)	0.1719352837797837
  (1, 366203)	0.24595562404108307
  (1, 348135)	0.4739279595416274
  (1, 256777)	0.28751585696559306
  (1, 217562)	0.40288153995289894
  (1, 145393)	0.575262969264869
  (1, 15110)	0.211037449588008
  (1, 6463)	0.30733520460524466
  (2, 400621)	0.4317732461913093
  (2, 256834)	0.2564939661498776
  (2, 183312)	0.5892069252021465
  (2, 89448)	0.36340369428387626
  (2, 34401)	0.37916255084357414
  :	:
  (319994, 123278)	0.4530341382559843
  (319995, 444934)	0.3211092817599261
  (319995, 420984)	0.22631428606830145
  (319995, 416257)	0.23816465111736276
  (319995, 3

In [33]:
print(X_train)
#Each printed entry is "(i, j)  value": i is the document (row) index, j is the vocabulary (column) index of a token, and value is that token's TF-IDF weight in the document

  (0, 443066)	0.4484755317023172
  (0, 235045)	0.41996827700291095
  (0, 109306)	0.3753708587402299
  (0, 185193)	0.5277679060576009
  (0, 354543)	0.3588091611460021
  (0, 436713)	0.27259876264838384
  (1, 160636)	1.0
  (2, 288470)	0.16786949597862733
  (2, 132311)	0.2028971570399794
  (2, 150715)	0.18803850583207948
  (2, 178061)	0.1619010109445149
  (2, 409143)	0.15169282335109835
  (2, 266729)	0.24123230668976975
  (2, 443430)	0.3348599670252845
  (2, 77929)	0.31284080750346344
  (2, 433560)	0.3296595898028565
  (2, 406399)	0.32105459490875526
  (2, 129411)	0.29074192727957143
  (2, 407301)	0.18709338684973031
  (2, 124484)	0.1892155960801415
  (2, 109306)	0.4591176413728317
  (3, 172421)	0.37464146922154384
  (3, 411528)	0.27089772444087873
  (3, 388626)	0.3940776331458846
  (3, 56476)	0.5200465453608686
  :	:
  (1279996, 390130)	0.22064742191076112
  (1279996, 434014)	0.2718945052332447
  (1279996, 318303)	0.21254698865277746
  (1279996, 237899)	0.2236567560099234
  (1279996, 2910

In [34]:
# Output the unique words
print("Vocabulary (unique words or tokens):")
print(vectorizer.get_feature_names_out())

Vocabulary (unique words or tokens):
['aa' 'aaa' 'aaaa' ... 'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz'
 'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz'
 'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz']


In [35]:
#Building the logistic regression model, allowing the solver up to 1000 iterations to converge
model = LogisticRegression(max_iter=1000)

In [36]:
#training model
model.fit(X_train,Y_train)

In [37]:
#Accuracy Score on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train,X_train_prediction)
print("Accuracy on training data",training_data_accuracy)

Accuracy on training data 0.79871953125


In [38]:
#Accuracy Score on testing data
X_test_prediction = model.predict(X_test)
testing_data_accuracy = accuracy_score(Y_test,X_test_prediction)
print("Accuracy on testing data",testing_data_accuracy)

Accuracy on testing data 0.77668125


In [40]:
#slight overfitting: training accuracy (~0.80) is about 2 points higher than testing accuracy (~0.78)

In [41]:
#saving trained model
import pickle

In [42]:
filename = "trained_model_nlp.sav"
#write the trained model to disk; the context manager guarantees the file
#is flushed and closed as soon as the dump finishes
with open(filename,'wb') as model_file:
    pickle.dump(model,model_file)

In [43]:
#Using the saved model for future prediction
#loading the saved model
#NOTE: pickle can execute arbitrary code while loading, so only unpickle files you created yourself

#the context manager closes the file handle once the model has been read
with open(filename,'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [44]:
#sanity check: run the reloaded model on the first training sample
X_new = X_train[0]
#true label of that sample
print(Y_train[0])
#the vectorized (TF-IDF) row that is passed to the model
print(X_new)
prediction = loaded_model.predict(X_new)
#predicted label, to compare with the true label printed above
print(prediction)

1
  (0, 443066)	0.4484755317023172
  (0, 235045)	0.41996827700291095
  (0, 109306)	0.3753708587402299
  (0, 185193)	0.5277679060576009
  (0, 354543)	0.3588091611460021
  (0, 436713)	0.27259876264838384
[1]


In [15]:
import re
import pickle
import numpy as np
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the trained model
# NOTE: pickle can execute arbitrary code while loading, so only unpickle
# files you created yourself. The context managers close each file handle
# as soon as its object has been read.
with open("trained_model_nlp.sav", 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# Load the TfidfVectorizer
with open("tfidf_vectorizer.pkl", 'rb') as vectorizer_file:
    loaded_vectorizer = pickle.load(vectorizer_file)

# Initialize the Porter Stemmer
port_stem = PorterStemmer()

def preprocess_tweet(tweet):
    """Clean a raw tweet and return its stemmed, stop-word-free form.

    Every character that is not an ASCII letter is replaced by a space, the
    text is lower-cased and split on whitespace, English stop words are
    dropped and each remaining word is stemmed with ``port_stem``.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The stems joined by single spaces (an empty string when no word
        survives the filtering).
    """
    # Calling stopwords.words() inside the comprehension rebuilt the list for
    # every single word and then scanned it linearly. Load it once, keep it
    # as a set on the function object, and reuse it on later calls.
    stop_words = getattr(preprocess_tweet, "_stop_words", None)
    if stop_words is None:
        stop_words = frozenset(stopwords.words("english"))
        preprocess_tweet._stop_words = stop_words

    words = re.sub('[^a-zA-Z]', ' ', tweet).lower().split()
    stems = [port_stem.stem(word) for word in words if word not in stop_words]
    return ' '.join(stems)

def predict_sentiment(tweet):
    """Predict the sentiment label of a single raw tweet.

    The tweet is cleaned with ``preprocess_tweet``, turned into a one-row
    feature matrix by ``loaded_vectorizer`` and classified by
    ``loaded_model``.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    The first (and only) element of the model's prediction; the example
    below treats the value 1 as positive and anything else as negative.
    """
    cleaned = preprocess_tweet(tweet)
    # transform() expects an iterable of documents, hence the one-item list
    features = loaded_vectorizer.transform([cleaned])
    return loaded_model.predict(features)[0]

# Example usage: classify one tweet and report the result in words
user_tweet = "I like you"
prediction = predict_sentiment(user_tweet)
print("Positive sentiment" if prediction == 1 else "Negative sentiment")


Positive sentiment
