In [47]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import nltk
from nltk.corpus import twitter_samples
# Make sure the twitter_samples corpus is present in the local NLTK data directory
# before the cells that read it with twitter_samples.strings(...).
nltk.download('twitter_samples')

[nltk_data] Downloading package twitter_samples to
[nltk_data]     /home/wsluser/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!


True

# Preparing the Dataset

In [48]:
# Export the NLTK sample tweets to one text file so they can be translated externally.
# File layout: positive tweets (one per line), a "==//==" separator line, negative tweets.
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

# Flatten embedded newlines so that every tweet occupies exactly one line;
# the loading cell later relies on "one line == one tweet".
tweets = [
    tweet.replace('\n', ' ')
    for tweet in all_positive_tweets + ["==//=="] + all_negative_tweets
]
file_content = '\n'.join(tweets)

# Tweets contain emoji and other non-ASCII characters: write UTF-8 explicitly
# instead of depending on the platform's default encoding.
with open("twitter_english.txt", "w", encoding="utf-8") as f:
    f.write(file_content)

In [49]:
# Thanks to https://www.onlinedoctranslator.com/app/translationprocess
# Read the Ukrainian version of the exported tweets (presumably produced by
# running twitter_english.txt through the service above).
# The encoding is pinned to UTF-8 so the Cyrillic text decodes the same way on every platform.
with open('twitter_english.en.uk.txt', 'r', encoding='utf-8') as file:
    file_content = file.read()

In [50]:
file_split = file_content.split("==//==")
all_positive_tweets = file_split[0].split("\n")
all_negative_tweets = file_split[1].split("\n")
sentiments = ([1] * len(all_positive_tweets)) + ([-1] * len(all_negative_tweets))
tweets = all_positive_tweets + all_negative_tweets
df = pd.DataFrame({
    "text": tweets,
    "sentiment": sentiments
})

# Data Preprocessing

In [51]:
# Thanks to https://github.com/skupriienko/Ukrainian-Stopwords
# Load the stop-word list, one word per line.
# UTF-8 is specified explicitly because the words are Cyrillic; surrounding
# whitespace and blank lines are discarded so that no entry is an empty or
# padded string that could never match a token.
with open("stopwords_ua.txt", "r", encoding="utf-8") as f:
    stopwords_ua = [word.strip() for word in f.read().split("\n") if word.strip()]

In [52]:
# Hold out 25% of the labelled tweets for evaluation; the fixed seed keeps the split repeatable.
texts, labels = df["text"], df["sentiment"]
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    labels,
    test_size=0.25,
    random_state=42,
)

# Bag-of-words features with the Ukrainian stop words removed.
# The vocabulary is learned from the training split only; the test split is
# encoded with that same vocabulary.
vectorizer = CountVectorizer(stop_words=stopwords_ua)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Training the Model

In [53]:
# Logistic-regression classifier with scikit-learn's default settings,
# fitted on the bag-of-words matrix of the training split.
model = LogisticRegression()
model.fit(X=X_train_vec, y=y_train)

# Evaluating the Model

In [54]:
# Score the held-out split: predict its labels, then report the fraction that match.
predictions = model.predict(X_test_vec)
test_accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", test_accuracy)

Accuracy: 0.7578905313623652


# Making Predictions

In [55]:
# Spot-check the classifier on twenty tweets from the positive half of the corpus.
# NOTE: these tweets come from the same list that fed `df`, so some of them were
# likely part of the training split; this is an illustration, not an evaluation.
sample_window = slice(2000, 2020)
new_texts = all_positive_tweets[sample_window]
new_texts_vec = vectorizer.transform(new_texts)
predictions = model.predict(new_texts_vec)
print(predictions)
display(new_texts)

[ 1  1 -1  1  1  1  1  1 -1  1  1  1  1  1  1  1  1  1  1  1]


['@jayyycgee Ну, ти збираєшся залучити до Team Doom. Отже, The Nasty Crew буде афілійовано. >:)',
 '@jazminbonilla77 Джазмін Бонілла :), хочеш секретний посібник, щоб отримати Free Stars Kim Hollywood? Слід перевірити це зараз, перш ніж ви пропустили мою біографію',
 'Після нашої ери! :)) http://t.co/nrqNiBm7Ks',
 ':) де ти знаходишся? @Hijay09',
 '@yamunakrish3 Круто :-)',
 '@oppentrapp Круто :D',
 '@JCMag82 Привіт, Джоне, зателефонуйте за номером +33 1 64 74 40 00 або зверніться до мерії, щоб дізнатися, чи знайшов хтось Теда. Гарного дня :)',
 'Pixgram — це програма для створення творчих слайд-шоу з фотографіями та музикою, які вам подобаютьс

In [56]:
# Spot-check the classifier on twenty tweets from the negative half of the corpus.
# NOTE: these tweets come from the same list that fed `df`, so some of them were
# likely part of the training split; this is an illustration, not an evaluation.
sample_window = slice(3000, 3020)
new_texts = all_negative_tweets[sample_window]
new_texts_vec = vectorizer.transform(new_texts)
predictions = model.predict(new_texts_vec)
print(predictions)
display(new_texts)

[-1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  1 -1 -1]


['@charmsham, я не можу переглянути фото.. :( яке це фото?',
 '@taylorswift13 Як мають голосувати Swifties за межами США? :( Дійсно дуже дуже хочу допомогти тобі виграти!',
 '@annayeng sorna :((((',
 '@zora_db Щойно надіслала тобі повідомлення, я вмираю від страждань, Джиме! :(',
 'мені подобається сірий комплект, але я не можу дозволити собі 3 комплекти :(',
 '@chantalYM_ Я зайшов до них і, мабуть, використав близько 6 Гб додаткових даних :(',
 'Завжди хотів побачити Two Door Cinema Club наживо :(',
 '@fenestawindows Тепер ти мене лякаєш :( #Fenestoscope',
 '@AinsworthKeira чи не так? Ах, яка це ганьба. Чому я не така королева шафла, як т