## Creación de un clasificador propio

In [1]:
# Data-source switch: when True, the cells below read the already
# preprocessed CSV files; when False they read the raw CSVs under data/.

USAR_PREPROCESADO = True

In [2]:
import pandas as pd

# Pick the training file, its column schema and any extra reader options
# according to USAR_PREPROCESADO, then read it with a single call.
if USAR_PREPROCESADO:
    ruta_train = 'clean_tweet.csv'
    cols = ['text', 'sentiment']
    opciones_lectura = {}
else:
    ruta_train = 'data/training.1600000.processed.noemoticon.csv'
    cols = ['sentiment', 'id', 'date', 'query_string', 'user', 'text']
    opciones_lectura = {'encoding': "ISO-8859-1"}

# Neither file is read with a header row; rows containing any missing value
# are discarded.
train = pd.read_csv(ruta_train, header=None, names=cols, **opciones_lectura).dropna()

In [3]:
# Show the loaded training DataFrame as the cell's output.
train

Unnamed: 0,text,sentiment
0,awww bummer shoulda got david carr third day,0
1,upset updat facebook text might cri result sch...,0
2,dive mani time ball manag save rest go bound,0
3,whole bodi feel itchi like fire,0
4,behav mad see,0
...,...,...
1599995,woke school best feel ever,4
1599996,thewdb com cool hear old walt interview bmta,4
1599997,readi mojo makeov ask detail,4
1599998,happi th birthday boo alll time tupac amaru sh...,4


In [4]:
# Split the training frame by label: sentiment 4 -> positive, 0 -> negative.
es_positivo = train['sentiment'].eq(4)
es_negativo = train['sentiment'].eq(0)
positivos = train[es_positivo]
negativos = train[es_negativo]

# Report how many tweets fall in each class.
n_positivos, n_negativos = len(positivos), len(negativos)
print("Positivos {}, Negativos {}".format(n_positivos, n_negativos))

Positivos 796077, Negativos 796385


In [5]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Number of tweets taken from the start of each class for the training subset.
N_MUESTRAS_POR_CLASE = 3000

sub_train_df = pd.concat([positivos[:N_MUESTRAS_POR_CLASE], negativos[:N_MUESTRAS_POR_CLASE]])
sub_train = sub_train_df.text.tolist()

vectorizer = TfidfVectorizer()

# X (sparse TF-IDF matrix) and y (sentiment labels) used for training
vectors = vectorizer.fit_transform(sub_train)
targets = sub_train_df.sentiment.tolist()

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() and fall back only on older releases.
try:
    features_names = list(vectorizer.get_feature_names_out())
except AttributeError:
    features_names = vectorizer.get_feature_names()

# Dense copy for inspection only (the models are fitted on the sparse
# `vectors`). toarray() yields an ndarray that DataFrame wraps directly,
# avoiding an intermediate nested Python list of every matrix value.
dense = vectors.toarray()

df = pd.DataFrame(dense, columns=features_names)

In [6]:
# Show the dense TF-IDF DataFrame (one column per vocabulary term) as the cell's output.
df

Unnamed: 0,aaaaaaaa,aaaaaaaaaah,aaaaahhhh,aaaaand,aaaah,aaah,aaliyah,aalsmeer,aaru,aaw,...,zombi,zone,zoo,zoom,zuraidah,zurich,zz,zzz,zzzz,zzzzzzz
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Load the evaluation set from the file that matches USAR_PREPROCESADO.
if USAR_PREPROCESADO:
    cols = ['text', 'sentiment']
    clean_test = pd.read_csv('clean_tweet_test.csv', header=None, names=cols)
else:
    cols = ['sentiment','id','date','query_string','user','text']
    clean_test = pd.read_csv('data/testdata.manual.2009.06.14.csv', header=None, names=cols)

# Ignore tweets labelled as neutral (class 2).
# Test-specific names are used on purpose: reassigning `positivos` /
# `negativos` here would overwrite the training subsets, and re-running the
# vectorizer cell afterwards would silently train on test rows.
test_positivos = clean_test[clean_test['sentiment'] == 4]
test_negativos = clean_test[clean_test['sentiment'] == 0]

# Rows without text cannot be vectorized (transform fails on NaN), so drop
# them, mirroring the dropna() applied when loading the training data.
test = pd.concat([test_positivos, test_negativos]).dropna(subset=['text'])

# Reuse the vocabulary/IDF fitted on the training subset; do not refit here.
vectors_test = vectorizer.transform(test.text.tolist())

## Support Vector Machine (SVM)

In [8]:
from sklearn import svm
from sklearn.metrics import classification_report

# Build the SVM and train it in one expression (fit() returns the estimator).
# NOTE(review): the variable names say "linear" but the kernel is 'rbf' —
# confirm which one is intended before renaming or changing the kernel.
classifier_linear = svm.SVC(kernel='rbf').fit(vectors, targets)

# Predict the labels of the vectorized test tweets.
prediction_linear = classifier_linear.predict(vectors_test)

In [9]:
# Per-class precision / recall / F1 of the SVM predictions on the test set.
etiquetas_reales = test['sentiment'].tolist()
report = classification_report(etiquetas_reales, prediction_linear)
print(report)

              precision    recall  f1-score   support

           0       0.77      0.70      0.73       177
           4       0.73      0.80      0.76       182

    accuracy                           0.75       359
   macro avg       0.75      0.75      0.75       359
weighted avg       0.75      0.75      0.75       359



## Naive Bayes

In [10]:
from sklearn.naive_bayes import MultinomialNB

# Train a multinomial Naive Bayes model on the TF-IDF features
# (fit() returns the estimator) and predict the test labels.
nb = MultinomialNB().fit(vectors, targets)
prediction = nb.predict(vectors_test)

In [11]:
# Per-class precision / recall / F1 of the Naive Bayes predictions on the test set.
etiquetas_reales = test['sentiment'].tolist()
report = classification_report(etiquetas_reales, prediction)
print(report)

              precision    recall  f1-score   support

           0       0.74      0.72      0.73       177
           4       0.73      0.76      0.75       182

    accuracy                           0.74       359
   macro avg       0.74      0.74      0.74       359
weighted avg       0.74      0.74      0.74       359



## Random Forest

In [14]:
from sklearn.ensemble import RandomForestClassifier

# Random forests are stochastic (bootstrap samples and random feature
# subsets); a fixed seed makes the fitted model and the reported metrics
# reproducible across kernel restarts.
rf = RandomForestClassifier(random_state=42)
rf.fit(vectors, targets)

# Predict the labels of the vectorized test tweets.
prediction_rf = rf.predict(vectors_test)

In [15]:
# Per-class precision / recall / F1 of the Random Forest predictions on the test set.
etiquetas_reales = test['sentiment'].tolist()
report = classification_report(etiquetas_reales, prediction_rf)
print(report)

              precision    recall  f1-score   support

           0       0.80      0.63      0.70       177
           4       0.70      0.85      0.77       182

    accuracy                           0.74       359
   macro avg       0.75      0.74      0.73       359
weighted avg       0.75      0.74      0.73       359

