In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [2]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [3]:
# Load data
data = pd.read_csv('Sentiment.csv')

In [13]:
data.head(20)

Unnamed: 0,id,candidate,candidate_confidence,relevant_yn,relevant_yn_confidence,sentiment,sentiment_confidence,subject_matter,subject_matter_confidence,candidate_gold,...,retweet_count,sentiment_gold,subject_matter_gold,text,tweet_coord,tweet_created,tweet_id,tweet_location,user_timezone,clean_text
0,1,No candidate mentioned,1.0,yes,1.0,Neutral,0.6578,None of the above,1.0,,...,5,,,RT @NancyLeeGrahn: How did everyone feel about...,,2015-08-07 09:54:46 -0700,629697200650592256,,Quito,rt nancyleegrahn everyone feel climate change ...
1,2,Scott Walker,1.0,yes,1.0,Positive,0.6333,None of the above,1.0,,...,26,,,RT @ScottWalker: Didn't catch the full #GOPdeb...,,2015-08-07 09:54:46 -0700,629697199560069120,,,rt scottwalker catch full gopdebate last night...
2,3,No candidate mentioned,1.0,yes,1.0,Neutral,0.6629,None of the above,0.6629,,...,27,,,RT @TJMShow: No mention of Tamir Rice and the ...,,2015-08-07 09:54:46 -0700,629697199312482304,,,rt tjmshow mention tamir rice gopdebate held c...
3,4,No candidate mentioned,1.0,yes,1.0,Positive,1.0,None of the above,0.7039,,...,138,,,RT @RobGeorge: That Carly Fiorina is trending ...,,2015-08-07 09:54:45 -0700,629697197118861312,Texas,Central Time (US & Canada),rt robgeorge carly fiorina trending hour debat...
4,5,Donald Trump,1.0,yes,1.0,Positive,0.7045,None of the above,1.0,,...,156,,,RT @DanScavino: #GOPDebate w/ @realDonaldTrump...,,2015-08-07 09:54:45 -0700,629697196967903232,,Arizona,rt danscavino gopdebate realdonaldtrump delive...
5,6,Ted Cruz,0.6332,yes,1.0,Positive,0.6332,None of the above,1.0,,...,228,,,"RT @GregAbbott_TX: @TedCruz: ""On my first day ...",,2015-08-07 09:54:44 -0700,629697194283499520,,Central Time (US & Canada),rt tedcruz first day rescind every illegal exe...
6,7,No candidate mentioned,1.0,yes,1.0,Negative,0.6761,FOX News or Moderators,1.0,,...,17,,,RT @warriorwoman91: I liked her and was happy ...,,2015-08-07 09:54:44 -0700,629697192383672320,North Georgia,Eastern Time (US & Canada),rt warriorwoman91 liked happy heard going mode...
7,8,No candidate mentioned,1.0,yes,1.0,Neutral,1.0,None of the above,1.0,,...,0,,,Going on #MSNBC Live with @ThomasARoberts arou...,,2015-08-07 09:54:44 -0700,629697192169750528,New York NY,Eastern Time (US & Canada),going msnbc live thomasaroberts around 2 pm et...
8,9,Ben Carson,1.0,yes,1.0,Negative,0.6889,None of the above,0.6444,,...,0,,,Deer in the headlights RT @lizzwinstead: Ben C...,,2015-08-07 09:54:44 -0700,629697190219243524,,Pacific Time (US & Canada),deer headlight rt lizzwinstead ben carson may ...
9,10,No candidate mentioned,0.4594,yes,0.6778,Negative,0.6778,None of the above,0.4594,,...,1,,,RT @NancyOsborne180: Last night's debate prove...,,2015-08-07 09:54:42 -0700,629697185093824512,,,rt nancyosborne180 last night debate proved go...


In [4]:
# Data preprocessing
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [7]:
def preprocess_text(text):
    # Tokenization
    tokens = word_tokenize(text.lower())
    # Removing stop words and punctuation
    filtered_tokens = [lemmatizer.lemmatize(word) for word in tokens if word.isalnum() and word not in stop_words]
    print(filtered_tokens)
    return ' '.join(filtered_tokens)

In [9]:
print(preprocess_text("going to delhi."))

['going', 'delhi']
going delhi


In [11]:
data['clean_text'] = data['text'].apply(preprocess_text)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
['rt', 'bettyfckinwhite', 'many', 'great', 'joke', 'twitter', 'tonight', 'gopdebates', 'many', 'stage']
['rt', 'rwsurfergirl', 'american', 'people', 'pick', 'next', 'president', 'united', 'state', 'fox', 'news', 'gopdebate', 'gopdebates']
['rt', 'leasavoy', 'overall', 'view', 'gopdebates', 'stayed', 'home', 'work', 'candidate', 'choice', 'changed', '1', 'iota', 'cruzcrew']
['rt', 'bettyfckinwhite', 'many', 'great', 'joke', 'twitter', 'tonight', 'gopdebates', 'many', 'stage']
['rt', 'happened', 'single', 'mention', 'votingrights', 'vra50', 'tonight', 'gopdebates']
['ca', 'wait', 'see', 'snl', 'skit', 'relating', 'gopdebates', 'candidate', 'make', 'sure', 'mention', 'god', 'every', 'sentence']
['many', 'great', 'joke', 'twitter', 'tonight', 'gopdebates', 'many', 'stage']
['rt', 'rwsurfergirl', 'american', 'people', 'pick', 'next', 'president', 'united', 'state', 'fox', 'news', 'gopdebate', 'gopdebates']
['rt', 'donniewahlbe

In [14]:
# Feature extraction
vectorizer = TfidfVectorizer(max_features=1000)
X = vectorizer.fit_transform(data['clean_text'])
y = data['sentiment']

In [15]:
print(y)

0         Neutral
1        Positive
2         Neutral
3        Positive
4        Positive
           ...   
13866    Negative
13867    Positive
13868    Positive
13869    Negative
13870    Positive
Name: sentiment, Length: 13871, dtype: object


In [16]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
# Model training
model = LogisticRegression()
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [18]:
# Model evaluation
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.6854054054054054


In [19]:
import joblib
joblib.dump(model, 'sentiment_model.pkl')
joblib.dump(vectorizer, 'vectorizer.pkl')

['vectorizer.pkl']