###  Task 0. 
Execute the notebook.

In [2]:
import nltk
from nltk.corpus import twitter_samples

In [3]:
# Read the tweet strings of both sample files in one pass
# (positive file first, then negative).
positive_tweets, negative_tweets = (
    twitter_samples.strings(fileid)
    for fileid in ('positive_tweets.json', 'negative_tweets.json')
)

In [4]:
# Tokenized form of the positive sample file (one entry per tweet).
tweet_tokens = twitter_samples.tokenized(
    "positive_tweets.json"
)

In [5]:
from nltk.tag import pos_tag

In [6]:
from nltk.stem.wordnet import WordNetLemmatizer
# Module-level lemmatizer instance. NOTE(review): process_tokens() below builds
# its own WordNetLemmatizer, so this global is not what that function uses.
lemmatizer = WordNetLemmatizer()

In [7]:
from nltk.corpus import stopwords
import re, string
def process_tokens(tokens):
    """Clean one tokenized text so it can be turned into classifier features.

    Tokens that contain a URL, an @mention or a #hashtag are discarded. Every
    remaining token is lemmatized with a coarse part of speech derived from
    its ``pos_tag`` tag, then dropped if it is punctuation or an English stop
    word.

    Args:
        tokens: Sequence of string tokens (for example one tokenized tweet).

    Returns:
        list: The surviving lemmas, lower-cased, in their original order.
    """
    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    mention_pattern = r'(@[A-Za-z0-9_]+)'
    hashtag_pattern = r'#[A-Za-z0-9_]+'

    cleaned_tokens = []
    # A set gives constant-time membership tests; the corpus reader hands back
    # a list, which would otherwise be scanned linearly for every token.
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    for token, tag in pos_tag(tokens):
        # Skip any token in which a URL, mention or hashtag is found.
        if (re.search(url_pattern, token) or
                re.search(mention_pattern, token) or
                re.search(hashtag_pattern, token)):
            continue

        # Collapse the tagger's tag into the single-letter POS handed to the
        # lemmatizer: nouns -> 'n', verbs -> 'v', everything else -> 'a'.
        if tag.startswith('NN'):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'

        token = lemmatizer.lemmatize(token, pos)

        # `in string.punctuation` is a substring test, so besides single
        # punctuation marks it also rejects the empty string and any run of
        # characters that appears contiguously in string.punctuation.
        if token not in string.punctuation and token.lower() not in stop_words:
            cleaned_tokens.append(token.lower())
    return cleaned_tokens

In [8]:
# Tokenize each sample file, then run every tweet through process_tokens().
positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

positive_cleaned_tokens_list = list(map(process_tokens, positive_tweet_tokens))
negative_cleaned_tokens_list = list(map(process_tokens, negative_tweet_tokens))

In [9]:
def get_token_dict(tokens):
    """Return a feature dict that maps every token in ``tokens`` to ``True``.

    Repeated tokens collapse into a single key.
    """
    return {token: True for token in tokens}
    
def get_tweets_for_model(cleaned_tokens_list):
    """Convert each cleaned token list into a feature dict via get_token_dict().

    Returns a list with one dict per entry of ``cleaned_tokens_list``, in the
    same order.
    """
    feature_dicts = []
    for cleaned_tokens in cleaned_tokens_list:
        feature_dicts.append(get_token_dict(cleaned_tokens))
    return feature_dicts

# Feature dicts for both classes, positive first.
positive_tokens_for_model, negative_tokens_for_model = (
    get_tweets_for_model(cleaned_list)
    for cleaned_list in (positive_cleaned_tokens_list, negative_cleaned_tokens_list)
)

In [10]:
import random

# Fixed seed so the shuffle, and therefore the train/test split and the
# accuracy reported below, is the same on every Restart & Run All.
SHUFFLE_SEED = 42
# Number of shuffled examples used for training; the remainder is the test set.
TRAIN_SIZE = 7000

positive_dataset = [(tweet_dict, "Positive")
                     for tweet_dict in positive_tokens_for_model]

negative_dataset = [(tweet_dict, "Negative")
                     for tweet_dict in negative_tokens_for_model]

dataset = positive_dataset + negative_dataset

# A dedicated Random instance keeps the seeding local instead of altering the
# state of the module-level generator.
random.Random(SHUFFLE_SEED).shuffle(dataset)

train_data = dataset[:TRAIN_SIZE]
test_data = dataset[TRAIN_SIZE:]

In [11]:
from nltk import classify
from nltk import NaiveBayesClassifier
# Fit a Naive Bayes model on the training split and score it on the test split.
classifier = NaiveBayesClassifier.train(train_data)

print("Accuracy is:", classify.accuracy(classifier, test_data))

# show_most_informative_features() prints the table itself and returns None,
# so it is called directly; wrapping it in print() emitted a stray "None".
classifier.show_most_informative_features(10)

Accuracy is: 0.9983333333333333
Most Informative Features
                      :( = True           Negati : Positi =   2069.2 : 1.0
                      :) = True           Positi : Negati =   1648.2 : 1.0
                     sad = True           Negati : Positi =     21.9 : 1.0
                     bam = True           Positi : Negati =     21.6 : 1.0
               community = True           Positi : Negati =     15.0 : 1.0
                 welcome = True           Positi : Negati =     15.0 : 1.0
                     x15 = True           Negati : Positi =     14.4 : 1.0
                    glad = True           Positi : Negati =     13.0 : 1.0
                  arrive = True           Positi : Negati =     11.8 : 1.0
           unfortunately = True           Negati : Positi =     11.7 : 1.0
None


In [12]:
# Fetch the 'punkt_tab' resource before word_tokenize is used further down.
nltk.download(
    "punkt_tab"
)

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [13]:
from nltk.tokenize import word_tokenize

# Classify one hand-written sentence: tokenize -> clean -> feature dict -> label.
custom_tweet = "the service was so bad"
custom_tokens = process_tokens(word_tokenize(custom_tweet))

custom_features = get_token_dict(custom_tokens)
custom_label = classifier.classify(custom_features)
print(custom_label)

Negative


In [14]:
def get_sentiment(text):
    """Return the label the current ``classifier`` assigns to ``text``.

    The text is tokenized with ``word_tokenize``, cleaned by
    ``process_tokens`` and converted to a feature dict by ``get_token_dict``.
    The global ``classifier`` is looked up at call time, so the result depends
    on whichever model that name is bound to when this runs.
    """
    cleaned = process_tokens(word_tokenize(text))
    features = get_token_dict(cleaned)
    return classifier.classify(features)

# A few short phrases to sanity-check the predictions.
texts = [
    "bad",
    "service is bad",
    "service is really bad",
    "service is so terrible",
    "great service",
    "they stole my money",
]
for t in texts:
    print(f"{t} :  {get_sentiment(t)}")

bad :  Negative
service is bad :  Negative
service is really bad :  Negative
service is so terrible :  Negative
great service :  Positive
they stole my money :  Negative


### Task 1. 
Re-train the classifier on a different set of data. For instance, use a dataset from HuggingFace or Kaggle.

In [15]:
import pandas as pd

# Load movie.csv (path is relative to the working directory); the cells below
# read its 'text' and 'label' columns.
movie = pd.read_csv(
    "movie.csv"
)

In [16]:
# Swap the numeric labels for the same strings the tweet model uses.
label_names = {0: 'Negative', 1: 'Positive'}
movie['label'] = movie['label'].replace(label_names)

In [17]:
def get_token_dict(tokens):
    """Return a feature dict that maps every token to ``True``.

    This definition replaces the ``get_token_dict`` defined earlier in the
    notebook, which took a list of tokens. To stay compatible with callers of
    that earlier version (such as ``get_sentiment``), it accepts both forms.

    Args:
        tokens: Either a raw string, which is split on whitespace, or an
            iterable of already-separated tokens.

    Returns:
        dict: One ``token -> True`` entry per distinct token.
    """
    # Only raw text needs splitting; token lists are used as they are, where
    # calling .split() on them would raise AttributeError.
    if isinstance(tokens, str):
        tokens = tokens.split()
    return {token: True for token in tokens}

# Pair each row's feature dict (built from its 'text') with its 'label'.
movie_nb = [
    (get_token_dict(review_text), sentiment)
    for review_text, sentiment in zip(movie['text'], movie['label'])
]

In [18]:
from sklearn.model_selection import train_test_split
# 80/20 split with a fixed random_state so the result is repeatable.
train_data, test_data = train_test_split(movie_nb, test_size=0.2, random_state=4)

# NOTE: this rebinds `classifier`, so get_sentiment() uses the movie-trained
# model from here on instead of the tweet-trained one.
classifier = NaiveBayesClassifier.train(train_data)

print("Accuracy is:", classify.accuracy(classifier, test_data))

# show_most_informative_features() prints the table itself and returns None,
# so it is called directly; wrapping it in print() emitted a stray "None".
classifier.show_most_informative_features(10)

Accuracy is: 0.852375
Most Informative Features
                   Avoid = True           Negati : Positi =    129.0 : 1.0
                     Uwe = True           Negati : Positi =     40.3 : 1.0
                    2/10 = True           Negati : Positi =     39.0 : 1.0
            unwatchable. = True           Negati : Positi =     39.0 : 1.0
                    4/10 = True           Negati : Positi =     37.8 : 1.0
                    Boll = True           Negati : Positi =     31.0 : 1.0
             amateurish. = True           Negati : Positi =     31.0 : 1.0
                steaming = True           Negati : Positi =     31.0 : 1.0
               awful.<br = True           Negati : Positi =     30.3 : 1.0
                   WORST = True           Negati : Positi =     30.2 : 1.0
None


### Task 2. 
Train a Logistic Regression classifier on the same data instead, and compare its results with those of Naive Bayes.

In [20]:
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.linear_model import LogisticRegression

# Wrap a scikit-learn logistic regression in NLTK's classifier interface and
# train it on the same split as the Naive Bayes model.
logistic_model = LogisticRegression(C=0.1, max_iter=400)
lg_classifier = SklearnClassifier(logistic_model).train(train_data)

# Score both models on the identical test split for a side-by-side comparison.
lg_accuracy = classify.accuracy(lg_classifier, test_data)
nb_accuracy = classify.accuracy(classifier, test_data)
print("Logistic Regression Accuracy is:", lg_accuracy)
print("Naive Bayes Accuracy is:", nb_accuracy)

Logistic Regression Accuracy is: 0.888125
Naive Bayes Accuracy is: 0.852375
