In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# dataset files stored there can be read with ordinary paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np

In [3]:
# Path template for the datasets folder on the mounted Drive; the '{}'
# placeholder is filled in with a file name.
dir_datasets = '/content/drive/MyDrive/DataScience/Data Science Course/0.Datasets/{}'

# Resolve the IMDB reviews file and read it into a DataFrame.
imdb_csv_path = dir_datasets.format('IMDB Dataset.csv')
df_review = pd.read_csv(imdb_csv_path)

In [4]:
# input (X)  --> review text (the 'review' column)
# output (Y) --> sentiment label (the 'sentiment' column)
df_review

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [5]:
# Count how many rows carry each sentiment label to check the class balance.
df_review.value_counts('sentiment')

sentiment
negative    25000
positive    25000
dtype: int64

In [None]:
# The bare string below is used as a block comment (this cell has no
# execution count). It notes that the dataset has 50000 rows, which is more
# than needed, so the data is going to be unbalanced a bit.
'''
El dataset contiene 50000 filas, es demasiada la informacion que contiene, por lo que hay que desbalancear un poco
'''

In [6]:
# Row masks selecting each sentiment class.
is_positive = df_review['sentiment'] == 'positive'
is_negative = df_review['sentiment'] == 'negative'

# Keep the first 9000 positive and the first 1000 negative reviews, which
# yields a smaller, deliberately imbalanced (9:1) dataset.
df_positivo = df_review.loc[is_positive].iloc[:9000]
df_negativo = df_review.loc[is_negative].iloc[:1000]

# Stack both subsets and show the resulting class counts.
df_review_des = pd.concat([df_positivo, df_negativo])
df_review_des.value_counts('sentiment')

sentiment
positive    9000
negative    1000
dtype: int64

In [7]:
from imblearn.under_sampling import RandomUnderSampler

# Balance the classes by randomly under-sampling the majority class.
# The seed is fixed so that the balanced sample (and therefore every result
# computed from it) is the same on each run; 42 matches the seed passed to
# train_test_split elsewhere in this notebook.
rus = RandomUnderSampler(random_state=42)

# fit_resample returns the resampled features and labels as two separate
# objects; unpack them explicitly instead of assigning into a column of a
# frame that is being created in the same statement.
review_resampled, sentiment_resampled = rus.fit_resample(
    df_review_des[['review']],
    df_review_des['sentiment'],
)

# Re-attach the labels to the resampled reviews (aligned on the index, as a
# plain column assignment would be).
df_review_bal = review_resampled.assign(sentiment=sentiment_resampled)

# Show the class counts after balancing.
df_review_bal.value_counts(['sentiment'])

sentiment
negative     1000
positive     1000
dtype: int64

**Separando data para entrenar y testear**

In [8]:
# Splitting helper from scikit-learn.
from sklearn.model_selection import train_test_split

# Split the balanced DataFrame into a training part and a test part.
# Nothing is trained here: 33% of the rows are held out for testing, and the
# fixed seed keeps the split identical between runs.
TEST_FRACTION = 0.33
SPLIT_SEED = 42
train, test = train_test_split(
    df_review_bal,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

In [9]:
# Features are the 'review' column; labels are the 'sentiment' column.
train_x = train['review']
train_y = train['sentiment']

test_x = test['review']
test_y = test['sentiment']

**Representación de Texto (Bag of Words) --> Texto a Número**



*   CountVectorizer: Frecuencia con la que cada palabra aparece en un texto (documento).
*   TF-IDF: La relevancia que tiene una palabra dentro de un texto: pesa más cuanto más frecuente es en ese texto y menos cuanto más se repite en el resto de los textos.



In [10]:
# CountVectorizer demo on two toy sentences.
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

text = [
    'Amo escribir codigo en Python. Amo el codigo en Python',
    'Odio escribir codigo en Java. Odio el codigo en Java.',
]
df = pd.DataFrame({'review': ['review1', 'review2'], 'text': text})

# Learn the vocabulary from the two sentences and count each word per sentence.
cv = CountVectorizer()
cv_matrix = cv.fit_transform(df['text'])

# Dense document-term table: one row per review, one column per vocabulary word.
df_dtm = pd.DataFrame(
    data=cv_matrix.toarray(),
    index=df['review'].values,
    columns=cv.get_feature_names_out(),
)
df_dtm

Unnamed: 0,amo,codigo,el,en,escribir,java,odio,python
review1,2,2,1,2,1,0,0,2
review2,0,2,1,2,1,2,2,0


**TF-IDF (term frequency - inverse document frequency): Representa un peso (relevancia) que tiene cada palabra**

In [11]:
# TF-IDF demo on the same two toy sentences.
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

text = [
    'Amo escribir codigo en Python. Amo el codigo en Python',
    'Odio escribir codigo en Java. Odio el codigo en Java.',
]
df = pd.DataFrame({'review': ['review1', 'review2'], 'text': text})

# NOTE(review): stop_words='english' has no visible effect on these Spanish
# sentences -- 'el' and 'en' still appear as columns in the resulting table.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['text'])

# Dense table of TF-IDF weights: one row per review, one column per word.
df_dtm = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    index=df['review'].values,
    columns=tfidf.get_feature_names_out(),
)
df_dtm

Unnamed: 0,amo,codigo,el,en,escribir,java,odio,python
review1,0.553373,0.393729,0.196865,0.393729,0.196865,0.0,0.0,0.553373
review2,0.0,0.393729,0.196865,0.393729,0.196865,0.553373,0.553373,0.0


**Transformar data de texto a data numerica**

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorizer configured with scikit-learn's built-in English stop-word list.
tfidf = TfidfVectorizer(stop_words='english')
# Fit on the training reviews only and turn them into a TF-IDF matrix.
train_x_vector = tfidf.fit_transform(train_x)

# Transform the test reviews with the already-fitted vectorizer (no refit),
# so the test matrix uses the vocabulary learned from the training reviews.
test_x_vector = tfidf.transform(test_x)


In [15]:
# Display the matrix summary (shape, dtype and number of stored elements).
train_x_vector

<1340x20198 sparse matrix of type '<class 'numpy.float64'>'
	with 116014 stored elements in Compressed Sparse Row format>

# **Support Vector Machines (SVM)**

In [16]:
from sklearn.svm import SVC

# Support vector classifier created with no arguments (library defaults).
svc = SVC()
# Train on the TF-IDF training matrix and the matching sentiment labels.
svc.fit(train_x_vector, train_y)

**Testeo**

In [18]:
# Spot-check the classifier on a few hand-written reviews. Each review is
# vectorized with the fitted TF-IDF vectorizer and its prediction is printed
# on its own line.
sample_reviews = [
    'A good movie',
    'An excellent movie',
    '"I did not like this movie at all I gave this movie away"',
]
for sample_review in sample_reviews:
    print(svc.predict(tfidf.transform([sample_review])))


['negative']
['positive']
['negative']
