In [1]:
# Mount Google Drive into the Colab runtime at /content/drive so that the
# dataset files stored there can be read like local files.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np

In [3]:
# Path template for the datasets folder on the mounted Drive; the '{}'
# placeholder is filled with the file name via str.format.
dir_datasets = '/content/drive/MyDrive/DataScience/Data Science Course/0.Datasets/{}'
# Load the IMDB reviews CSV into a DataFrame.
df_review = pd.read_csv(dir_datasets.format('IMDB Dataset.csv'))

In [None]:
# input (X)  --> review texts
# output (Y) --> sentiment labels

In [4]:
# Count how many rows carry each value of the 'sentiment' column.
df_review.value_counts('sentiment')

sentiment
negative    25000
positive    25000
dtype: int64

In [None]:
'''
The dataset contains 50000 rows, which is more data than this exercise needs, so we deliberately unbalance it a bit
'''

In [5]:
# Build a deliberately imbalanced sample: the first 9000 positive reviews
# and the first 1000 negative reviews, taken in their original row order.
df_positivo = df_review.query("sentiment == 'positive'").head(9000)
df_negativo = df_review.query("sentiment == 'negative'").head(1000)

# Stack the positives on top of the negatives (original row labels are kept).
df_review_des = pd.concat([df_positivo, df_negativo])

# Display the resulting class counts.
df_review_des.value_counts('sentiment')

sentiment
positive    9000
negative    1000
dtype: int64

In [6]:
from imblearn.under_sampling import RandomUnderSampler

# Randomly drop rows of the majority class (positive) until both classes have
# the same number of samples.
# random_state is fixed (same seed as the train/test split) so the selected
# subset, and therefore every metric computed further down, is the same on
# each "Restart & Run All".
rus = RandomUnderSampler(random_state=42)

# fit_resample returns the resampled features and the resampled labels as two
# separate objects; unpack them explicitly instead of relying on the
# left-to-right evaluation order of a chained tuple assignment.
reviews_bal, sentiment_bal = rus.fit_resample(
    df_review_des[['review']],
    df_review_des['sentiment'],
)

# Re-attach the labels so the balanced data lives in a single DataFrame with
# the columns 'review' and 'sentiment'.
df_review_bal = reviews_bal
df_review_bal['sentiment'] = sentiment_bal

# Display the class counts after balancing.
df_review_bal.value_counts(['sentiment'])

sentiment
negative     1000
positive     1000
dtype: int64

**Separando data para entrenar y testear**

In [7]:
# Import the splitting helper from scikit-learn.
from sklearn.model_selection import train_test_split 

# Split the balanced DataFrame into a training part and a test part.
# Nothing is trained here: 33% of the rows are held out for testing, and
# random_state fixes the shuffle so the split is repeatable.
train, test = train_test_split(df_review_bal, test_size=0.33, random_state=42)

In [8]:
# Features are the raw review texts; targets are the sentiment labels.
train_x = train['review']
train_y = train['sentiment']

test_x = test['review']
test_y = test['sentiment']

**Representacion de Texto (Bag of Words) --> Texto a Numero**



*   CountVectorizer: Frecuencia con la que una palabra aparece en una oración.
*   TFIDF: Relevancia de una palabra dentro de una oración: pesa más cuanto más frecuente es en esa oración y menos cuanto más se repite en el resto de los documentos.



In [9]:
# Count Vectorizer demo: represent each sentence by how many times each
# vocabulary word occurs in it.
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

text = [
    'Amo escribir codigo en Python. Amo el codigo en Python',
    'Odio escribir codigo en Java. Odio el codigo en Java.',
]
df = pd.DataFrame({'review': ['review1', 'review2'], 'text': text})

# Learn the vocabulary from the two sentences and count occurrences.
cv = CountVectorizer()
cv_matrix = cv.fit_transform(df['text'])

# Document-term matrix: one row per review, one column per vocabulary word.
df_dtm = pd.DataFrame(
    data=cv_matrix.toarray(),
    index=df['review'].values,
    columns=cv.get_feature_names_out(),
)
df_dtm

Unnamed: 0,amo,codigo,el,en,escribir,java,odio,python
review1,2,2,1,2,1,0,0,2
review2,0,2,1,2,1,2,2,0


**TFIDF (term frequency - inverse document frequency): representa el peso (relevancia) que tiene cada palabra**

In [10]:
# TF-IDF demo: same two sentences as the count example, but each word gets a
# weight instead of a raw count.
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

text = [
    'Amo escribir codigo en Python. Amo el codigo en Python',
    'Odio escribir codigo en Java. Odio el codigo en Java.',
]
df = pd.DataFrame({'review': ['review1', 'review2'], 'text': text})

# NOTE(review): stop_words='english' is applied to Spanish sentences; the
# displayed result still contains all eight words, so it filters nothing here.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['text'])

# Document-term matrix of TF-IDF weights: one row per review, one column per
# vocabulary word.
df_dtm = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    index=df['review'].values,
    columns=tfidf.get_feature_names_out(),
)
df_dtm

Unnamed: 0,amo,codigo,el,en,escribir,java,odio,python
review1,0.553373,0.393729,0.196865,0.393729,0.196865,0.0,0.0,0.553373
review2,0.0,0.393729,0.196865,0.393729,0.196865,0.553373,0.553373,0.0


**Transformar data de texto a data numerica**

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the TF-IDF vocabulary and weights on the training reviews only, and
# turn those reviews into a numeric matrix in the same step.
tfidf = TfidfVectorizer(stop_words='english')
train_x_vector = tfidf.fit_transform(train_x)

# Reuse the already-fitted vectorizer on the test reviews (transform only,
# no refit), so train and test share the same columns.
test_x_vector = tfidf.transform(test_x)


In [12]:
# Display the vectorized training data (a sparse matrix: rows are reviews,
# columns are vocabulary terms).
train_x_vector

<1340x19684 sparse matrix of type '<class 'numpy.float64'>'
	with 112422 stored elements in Compressed Sparse Row format>

**Support Vector Machines (SVM)**

In [13]:
from sklearn.svm import SVC

# Support Vector Classifier with default settings, fitted on the TF-IDF
# training matrix and the training sentiment labels.
svc = SVC()
svc.fit(train_x_vector, train_y)

**Testeo**

In [14]:
# Spot-check the SVM on a few hand-written reviews. Each text goes through
# the same fitted TF-IDF vectorizer before being classified.
sample_reviews = (
    'A good movie',
    'An excellent movie',
    '"I did not like this movie at all I gave this movie away"',
)
for sample_review in sample_reviews:
    sample_vector = tfidf.transform([sample_review])
    print(svc.predict(sample_vector))


['negative']
['positive']
['negative']


**Arboles de decision**

In [15]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree fitted on the TF-IDF training matrix.
# random_state is fixed (same seed as the train/test split) because the tree
# builder permutes features when choosing splits; without a seed, ties between
# equally good splits can be resolved differently on each run, which makes the
# reported test score drift between executions.
dec_tree = DecisionTreeClassifier(random_state=42)
dec_tree.fit(train_x_vector, train_y)

# **Naive Bayes**

In [16]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes fitted on the training data.
# The sparse TF-IDF matrix is converted to a dense array with .toarray()
# before fitting — presumably because GaussianNB does not accept sparse
# input; confirm against the scikit-learn docs.
gnb = GaussianNB()
gnb.fit(train_x_vector.toarray(), train_y)

**Logistic Regression**

In [17]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default settings, fitted on the TF-IDF training
# matrix and the training sentiment labels.
lr = LogisticRegression()
lr.fit(train_x_vector, train_y)

# **Evaluacion del modelo**

In [18]:
# Test-set score of every model, printed as a percentage in this order:
# SVM, decision tree, Gaussian Naive Bayes, logistic regression.
# The Naive Bayes model is scored on the dense version of the test matrix,
# matching how it was fitted.
models_and_features = (
    (svc, test_x_vector),
    (dec_tree, test_x_vector),
    (gnb, test_x_vector.toarray()),
    (lr, test_x_vector),
)
for model, features in models_and_features:
    score_pct = round(model.score(features, test_y) * 100, 2)
    print(score_pct, '%')

80.91 %
68.18 %
60.3 %
81.97 %


In [19]:
# F1 score: the harmonic mean of precision and recall.
# recall: of all samples that truly belong to a class, the fraction the model found.
# precision: of all samples the model assigned to a class, the fraction that truly belong to it.
from sklearn.metrics import f1_score

# average=None yields one F1 value per label, in the order given by `labels`
# (positive first, then negative).
f1_score(test_y, svc.predict(test_x_vector), 
         labels=['positive', 'negative'],
         average=None)

array([0.81360947, 0.80434783])

In [20]:
# Classification report: precision, recall, F1 and support per class for the
# SVM predictions on the test set, listed in the order given by `labels`.
from sklearn.metrics import classification_report

print(classification_report(test_y, svc.predict(test_x_vector), 
                      labels=['positive', 'negative']))

              precision    recall  f1-score   support

    positive       0.81      0.82      0.81       335
    negative       0.81      0.80      0.80       325

    accuracy                           0.81       660
   macro avg       0.81      0.81      0.81       660
weighted avg       0.81      0.81      0.81       660



In [22]:
# Confusion matrix for the SVM predictions on the test set.
# Rows are the true labels and columns the predicted labels, both ordered as
# given in `labels` (positive, negative); each row sums to that class's
# number of test samples.
from sklearn.metrics import confusion_matrix

confusion_matrix(test_y, svc.predict(test_x_vector), 
                      labels=['positive', 'negative'])

array([[275,  60],
       [ 66, 259]])

# **Optimización del Modelo**

In [23]:
from sklearn.model_selection import GridSearchCV
# GridSearchCV exhaustively evaluates every combination of the parameter
# values listed below and keeps the best-scoring one.

parametros = {
    # C: regularization parameter of the SVM. The regularization strength is
    # inversely proportional to C, so larger values tolerate fewer
    # misclassified training samples.
    'C': [1, 4, 8, 16, 32],
    # kernel: the function used to compare samples (linear, RBF, polynomial, ...).
    'kernel': ['linear', 'rbf'],
}

# A fresh SVC() is passed directly instead of rebinding the name `svc`.
# Rebinding would replace the fitted baseline model with an unfitted one, and
# re-running any evaluation cell that calls svc.predict / svc.score afterwards
# would fail because the estimator is no longer fitted.
svc_grid = GridSearchCV(SVC(), parametros, cv=5)  # cv=5: 5-fold cross-validation
svc_grid.fit(train_x_vector, train_y)

In [24]:
# Show the best estimator found by the grid search and the parameter
# combination that produced it.
print(svc_grid.best_estimator_)
print(svc_grid.best_params_)

SVC(C=4)
{'C': 4, 'kernel': 'rbf'}


In [25]:
# Mean cross-validated score of the best parameter combination.
print(svc_grid.best_score_)

0.8186567164179104
