## Imports

In [21]:
import os
import pandas as pd
import numpy as np
import nltk
from nltk.tag import UnigramTagger
from nltk.corpus import treebank
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import sentiwordnet as swn
from sklearn.cluster import KMeans
from sklearn.metrics import classification_report

In [2]:
# Fetch the tagged treebank corpus that is used below to train the POS tagger.
nltk.download('treebank')

[nltk_data] Downloading package treebank to
[nltk_data]     C:\Users\hugop\AppData\Roaming\nltk_data...
[nltk_data]   Package treebank is already up-to-date!


True

In [3]:
# Fetch the SentiWordNet lexicon queried by score_words() further down.
# NOTE(review): the SentiWordNet reader presumably also needs the 'wordnet'
# corpus, and no visible cell downloads it (nor 'punkt') — confirm on a fresh
# environment.
nltk.download('sentiwordnet')

[nltk_data] Downloading package sentiwordnet to
[nltk_data]     C:\Users\hugop\AppData\Roaming\nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!


True

## Gathering data

In [4]:
# List the review files of each class. os.listdir() returns entries in
# arbitrary order, so sort them to keep the row order of the feature table
# (and therefore the clustering input) stable from one run to the next.
all_files_neg = sorted(os.listdir("data/neg/"))
all_files_pos = sorted(os.listdir("data/pos/"))

In [5]:
# Load every negative review into memory, one string per file.
neg = []
for fichier in all_files_neg:
    # The context manager closes each handle as soon as the file is read,
    # instead of leaving one open descriptor per review until garbage collection.
    with open(f"data/neg/{fichier}", 'r') as handle:
        neg.append(handle.read())

In [6]:
# Load every positive review into memory, one string per file.
pos = []
for fichier in all_files_pos:
    # The context manager closes each handle as soon as the file is read,
    # instead of leaving one open descriptor per review until garbage collection.
    with open(f"data/pos/{fichier}", 'r') as handle:
        pos.append(handle.read())

## Tokenize

In [7]:
# NOTE(review): this binding looks redundant — the same resource is loaded again
# as `english_tokenizer` in the next cell, and `tokenizer` is rebound to a
# RegexpTokenizer before it is ever used.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [8]:
# Load the English Punkt model; it is used below to split each review into
# sentences.
english_tokenizer=nltk.data.load('tokenizers/punkt/english.pickle')

In [9]:
# Replace each raw review string by its list of sentences. Slice assignment
# keeps the existing `neg` / `pos` list objects and only swaps their contents.
neg[:] = [english_tokenizer.tokenize(review) for review in neg]
pos[:] = [english_tokenizer.tokenize(review) for review in pos]

In [10]:
# Word-level tokenizer: keeps runs of \w characters, so punctuation is dropped.
# This rebinds `tokenizer`, replacing the Punkt model assigned earlier.
tokenizer = RegexpTokenizer(r'\w+')

In [11]:
# Turn every sentence string into its list of word tokens, so each review
# becomes a list of sentences, each sentence a list of words.
neg[:] = [[tokenizer.tokenize(sentence) for sentence in review] for review in neg]
pos[:] = [[tokenizer.tokenize(sentence) for sentence in review] for review in pos]

## Tag

In [12]:
# Train a unigram part-of-speech tagger on the first 3000 tagged sentences of
# the NLTK treebank corpus.
# NOTE(review): no backoff tagger is passed; presumably words absent from the
# training sentences are tagged None and are therefore never selected as
# adverbs downstream — confirm.
train_sents = treebank.tagged_sents()[:3000]
tagger = UnigramTagger(train_sents)

In [13]:
# POS-tag every sentence of the negative reviews: each word list becomes a
# list of (word, tag) pairs.
neg[:] = [[tagger.tag(sentence) for sentence in review] for review in neg]

In [14]:
# POS-tag every sentence of the positive reviews: each word list becomes a
# list of (word, tag) pairs.
pos[:] = [[tagger.tag(sentence) for sentence in review] for review in pos]

## Selecting only adverbs

In [15]:
# Keep only the words tagged as adverbs (RB, RBR, RBS) in each sentence of the
# negative reviews; the tags themselves are discarded.
neg[:] = [
    [
        [word for word, tag in sentence if tag in ('RB', 'RBR', 'RBS')]
        for sentence in review
    ]
    for review in neg
]

In [16]:
# Keep only the words tagged as adverbs (RB, RBR, RBS) in each sentence of the
# positive reviews; the tags themselves are discarded.
pos[:] = [
    [
        [word for word, tag in sentence if tag in ('RB', 'RBR', 'RBS')]
        for sentence in review
    ]
    for review in pos
]

In [17]:
# Collapse the per-sentence adverb lists so that each negative review is a
# single flat list of adverbs (sentence order preserved).
neg[:] = [[adverb for sentence in review for adverb in sentence] for review in neg]

In [18]:
# Collapse the per-sentence adverb lists so that each positive review is a
# single flat list of adverbs (sentence order preserved).
pos[:] = [[adverb for sentence in review for adverb in sentence] for review in pos]

## SentiWordNet

In [19]:
def _mean_or_zero(values):
    """Return the mean of ``values``, or 0.0 when the list is empty."""
    return np.mean(values) if values else np.float64(0.0)


def score_words(_list, lexicon=None):
    """Average the non-zero sentiment scores of the words of each document.

    For every word, only the first sense returned by
    ``lexicon.senti_synsets(word)`` is considered; words with no sense at all
    are skipped. A score equal to 0.0 is left out of the corresponding average,
    so each mean is taken over the words that actually carry that polarity.

    Parameters
    ----------
    _list : list of list of str
        One inner list of words per document.
    lexicon : object, optional
        Any object exposing ``senti_synsets(word)`` whose results provide
        ``pos_score()``, ``neg_score()`` and ``obj_score()``. Defaults to the
        SentiWordNet corpus reader imported as ``swn``.

    Returns
    -------
    tuple of (list, list, list)
        Per-document mean positive, negative and objective scores, in that
        order. A document with no non-zero score of a given kind gets 0.0.
    """
    if lexicon is None:
        lexicon = swn

    total_list_pos = []
    total_list_obj = []
    total_list_neg = []
    for words in _list:
        list_pos = []
        list_obj = []
        list_neg = []
        for word in words:
            # Query the lexicon once per word; the three scores all come from
            # the same first sense, so repeated lookups are pure overhead.
            first_sense = next(iter(lexicon.senti_synsets(word)), None)
            if first_sense is None:
                # Word unknown to the lexicon: contributes to no average.
                continue

            pos_score = first_sense.pos_score()
            neg_score = first_sense.neg_score()
            obj_score = first_sense.obj_score()
            if pos_score != 0.0:
                list_pos.append(pos_score)
            if neg_score != 0.0:
                list_neg.append(neg_score)
            if obj_score != 0.0:
                list_obj.append(obj_score)

        total_list_pos.append(_mean_or_zero(list_pos))
        total_list_neg.append(_mean_or_zero(list_neg))
        total_list_obj.append(_mean_or_zero(list_obj))
    return total_list_pos, total_list_neg, total_list_obj

In [22]:
# Per-review mean positive / negative / objective scores of the adverbs found
# in the negative reviews.
neg_pos, neg_neg, neg_obj = score_words(neg)

In [23]:
# Per-review mean positive / negative / objective scores of the adverbs found
# in the positive reviews.
pos_pos, pos_neg, pos_obj = score_words(pos)

In [24]:
# Stack both classes into single feature lists: all negative reviews come
# first, followed by all positive reviews.
score_pos = neg_pos + pos_pos
score_neg = neg_neg + pos_neg
score_obj = neg_obj + pos_obj

In [25]:
# Ground-truth labels, in the same order as the stacked score lists:
# 0 for each negative review, then 1 for each positive review. The counts are
# taken from the loaded data rather than hard-coded, so the labels stay
# aligned with the feature rows whatever the corpus size.
classe = [0] * len(neg) + [1] * len(pos)

In [26]:
# Feature table: one row per review, one column per averaged sentiment score.
df = pd.DataFrame(
    {
        'pos': score_pos,
        'neg': score_neg,
        'obj': score_obj,
    }
)

## Clustering

In [36]:
# Two clusters, hoping they line up with the negative / positive classes.
# random_state pins the centroid initialisation so that a re-run reproduces
# the same clustering; n_init is set explicitly because its default differs
# between scikit-learn versions.
model = KMeans(n_clusters=2, n_init=10, random_state=0)
model.fit(df.values)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=2, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [37]:
# Cluster id assigned to each review, computed on the same rows the model was
# fitted on.
clusters = model.predict(df.values)

In [38]:
# Compare the cluster ids with the true class labels.
# NOTE(review): KMeans cluster ids are arbitrary, so cluster 0 is not
# guaranteed to correspond to class 0 (negative); the report may be computed
# against swapped labels — confirm the mapping before interpreting the scores.
print(classification_report(classe, clusters))

              precision    recall  f1-score   support

           0       0.53      0.43      0.48      1000
           1       0.52      0.62      0.57      1000

    accuracy                           0.53      2000
   macro avg       0.53      0.53      0.52      2000
weighted avg       0.53      0.53      0.52      2000

