# Clasificacion de Texto
## Jahzeel Ulises Mendez Diaz

In [58]:
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer

### Stemming

Stemming es un método para reducir una palabra a su raíz o (en inglés) a un stem. Hay algunos algoritmos de stemming que ayudan en sistemas de recuperación de información. Stemming aumenta el recall que es una medida sobre el número de documentos que se pueden encontrar con una consulta. Por ejemplo una consulta sobre "bibliotecas" también encuentra documentos en los que solo aparezca "bibliotecario" porque el stem de las dos palabras es el mismo ("bibliotec"). [(ref)](https://es.wikipedia.org/wiki/Stemming)

In [59]:
stemmer = SnowballStemmer('english')
def tokenize_and_stem(text):
  tokens = word_tokenize(text.lower())
  stems = [stemmer.stem(token) for token in tokens if token.isalpha()]
  return ' '.join(stems)

### Lematización

La lematización es un proceso lingüístico que consiste en, dada una forma flexionada (es decir, en plural, en femenino, conjugada, etc), hallar el lema correspondiente. El lema es la forma que por convenio se acepta como representante de todas las formas flexionadas de una misma palabra. Es decir, el lema de una palabra es la palabra que nos encontraríamos como entrada en un diccionario tradicional: singular para sustantivos, masculino singular para adjetivos, infinitivo para verbos. Por ejemplo, decir es el lema de dije, pero también de diré o dijéramos; guapo es el lema de guapas; mesa es el lema de mesas. [(ref)](https://es.wikipedia.org/wiki/Lematizaci%C3%B3n)

In [60]:
lemmatizer = WordNetLemmatizer()
def tokenize_and_lematize(text):
  tokens = word_tokenize(text.lower())
  stems = [lemmatizer.lemmatize(token)for token in tokens if token.isalpha()]
  return ' '.join(stems)

### Dataframe

In [61]:
#Cargamos el dataframe
data = pd.read_csv("data/twitter_training.csv")
data = data.rename(columns={"2401":"No","Borderlands":"Game","Positive":"Class","im getting on borderlands and i will murder you all ,":"Text"})
data

Unnamed: 0,No,Game,Class,Text
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...
...,...,...,...,...
74676,9200,Nvidia,Positive,Just realized that the Windows partition of my...
74677,9200,Nvidia,Positive,Just realized that my Mac window partition is ...
74678,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...
74679,9200,Nvidia,Positive,Just realized between the windows partition of...


In [62]:
#Eliminamos filas con np.nan
data = data.dropna()

In [63]:
#Aplicamos la funcion de lematizacion y de stemming
nltk.download("punkt")
nltk.download("stopwords")
nltk.download('wordnet')

data["text_stem"] = data["Text"].apply(tokenize_and_stem)
data["text_lem"] = data["Text"].apply(tokenize_and_lematize)
data

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["text_stem"] = data["Text"].apply(tokenize_and_stem)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/

Unnamed: 0,No,Game,Class,Text,text_stem,text_lem
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...,i am come to the border and i will kill you all,i am coming to the border and i will kill you all
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...,im get on borderland and i will kill you all,im getting on borderland and i will kill you all
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...,im come on borderland and i will murder you all,im coming on borderland and i will murder you all
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...,im get on borderland and i will murder you me all,im getting on borderland and i will murder you...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...,im get into borderland and i can murder you all,im getting into borderland and i can murder yo...
...,...,...,...,...,...,...
74676,9200,Nvidia,Positive,Just realized that the Windows partition of my...,just realiz that the window partit of my mac i...,just realized that the window partition of my ...
74677,9200,Nvidia,Positive,Just realized that my Mac window partition is ...,just realiz that my mac window partit is year ...,just realized that my mac window partition is ...
74678,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...,just realiz the window partit of my mac is now...,just realized the window partition of my mac i...
74679,9200,Nvidia,Positive,Just realized between the windows partition of...,just realiz between the window partit of my ma...,just realized between the window partition of ...


### Primeros modelos

In [64]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(data["text_lem"],data["Class"])

In [65]:
#Tranformamos el texto bag-of-words
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)

#### Naive Bayes

In [66]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X_train_counts,y_train)

In [67]:
from sklearn.metrics import confusion_matrix
X_test_counts = count_vect.transform(X_test)
print(clf.score(X_test_counts,y_test))
confusion_matrix(clf.predict(X_test_counts),y_test)

0.7172279582680144


array([[1845,  178,  200,  143],
       [ 603, 4655,  874,  621],
       [ 175,  267, 2743,  290],
       [ 566,  507,  807, 4025]], dtype=int64)

#### SVM

In [68]:
from sklearn.linear_model import SGDClassifier
clf = SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42,max_iter=5, tol=None)
clf.fit(X_train_counts,y_train)

In [69]:
print(clf.score(X_test_counts,y_test))
confusion_matrix(clf.predict(X_test_counts),y_test)

0.653711011406022


array([[1192,  135,  146,  113],
       [ 757, 4471,  880,  730],
       [ 339,  349, 2564,  370],
       [ 901,  652, 1034, 3866]], dtype=int64)

### Prueba con n-gramas

In [80]:
count_vect = CountVectorizer(ngram_range=(1,4))
X_train_ngram = count_vect.fit_transform(X_train)

#### Naive Bayes

In [81]:
clf = MultinomialNB()
clf.fit(X_train_ngram,y_train)

In [82]:
X_test_ngram = count_vect.transform(X_test)
print(clf.score(X_test_ngram,y_test))
confusion_matrix(clf.predict(X_test_ngram),y_test)

0.9029136710092437


array([[2638,   18,   19,   24],
       [ 268, 5351,  337,  288],
       [  77,   91, 4040,   93],
       [ 206,  147,  228, 4674]], dtype=int64)

#### SVM

In [83]:
clf = SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42,max_iter=5, tol=None)
clf.fit(X_train_ngram,y_train)

In [84]:
X_test_ngram = count_vect.transform(X_test)
print(clf.score(X_test_ngram,y_test))
confusion_matrix(clf.predict(X_test_ngram),y_test)

0.86620898426942


array([[2416,   31,   35,   39],
       [ 245, 5111,  266,  259],
       [ 102,   96, 3808,   92],
       [ 426,  369,  515, 4689]], dtype=int64)