#### [Formação do Dataframe](#dataframe)
#### [Train Test Split](#split)
#### [Vetorização](#vetorizacao)
#### [Rede Neural](#rede)

### Imports

In [65]:
import torch
import torch.nn as nn
import torch.nn.functional as F


import numpy as np
from collections import Counter

import pandas as pd
import seaborn as sns
from os import listdir


from sklearn.feature_extraction.text import TfidfVectorizer

### Arquivos

In [3]:
# Show the contents of the current working directory; the listing below
# includes the reviews.txt and labels.txt files read in the next section.
listdir()

['.ipynb_checkpoints',
 'labels.txt',
 'mini_projeto_sentiment_analysis.ipynb',
 'reviews.txt',
 'Sentiment Analysis Intro.ipynb',
 'sentiment_network.png',
 'sentiment_network_2.png',
 'sentiment_network_pos.png']

### Formação do Dataframe <a id="dataframe"></a>

In [7]:
# Load the reviews into a one-column DataFrame. header=None makes pandas treat
# the first line as data, and names=['text'] supplies the column name.
# NOTE(review): read_csv still splits on commas; this presumably relies on the
# reviews containing none — confirm.
df = pd.read_csv('reviews.txt', header=None, names=['text'])

In [21]:
# Widen the column display to 150 characters so more of each review is
# visible in the preview, then show the first rows.
pd.set_option('display.max_colwidth',150)
df.head()

Unnamed: 0,text
0,bromwell high is a cartoon comedy . it ran at the same time as some other programs about school life such as teachers . my years in the teach...
1,story of a man who has unnatural feelings for a pig . starts out with a opening scene that is a terrific example of absurd comedy . a formal orche...
2,homelessness or houselessness as george carlin stated has been an issue for years but never a plan to help those on the street that were once co...
3,airport starts as a brand new luxury plane is loaded up with valuable paintings such belonging to rich businessman philip stevens james st...
4,brilliant over acting by lesley ann warren . best dramatic hobo lady i have ever seen and love scenes in clothes warehouse are second to none . ...


In [22]:
# Load the sentiment labels into their own one-column DataFrame
# (no header row in the file, so the column name is supplied here).
df_rev = pd.read_csv('labels.txt', header=None, names=['labels'])
df_rev.head()

Unnamed: 0,labels
0,positive
1,negative
2,positive
3,negative
4,positive


In [23]:
# Attach the labels to the reviews. The assignment aligns on the index, and
# both frames carry the default 0..n-1 index, so row i of labels.txt is
# paired with row i of reviews.txt.
df['labels'] = df_rev.labels
df.head()

Unnamed: 0,text,labels
0,bromwell high is a cartoon comedy . it ran at the same time as some other programs about school life such as teachers . my years in the teach...,positive
1,story of a man who has unnatural feelings for a pig . starts out with a opening scene that is a terrific example of absurd comedy . a formal orche...,negative
2,homelessness or houselessness as george carlin stated has been an issue for years but never a plan to help those on the street that were once co...,positive
3,airport starts as a brand new luxury plane is loaded up with valuable paintings such belonging to rich businessman philip stevens james st...,negative
4,brilliant over acting by lesley ann warren . best dramatic hobo lady i have ever seen and love scenes in clothes warehouse are second to none . ...,positive


In [68]:
# Encode sentiment as an integer target: 1 for 'positive', 0 for anything else.
# The column is derived from df_rev (the untouched label strings) rather than
# from df.labels itself, so re-running this cell is idempotent. The previous
# in-place mapping turned every label into 0 on a second run, because the
# already-encoded 1s no longer compare equal to 'positive'.
df['labels'] = (df_rev.labels == 'positive').astype('int64')
df.head()

Unnamed: 0,text,labels
0,bromwell high is a cartoon comedy . it ran at the same time as some other programs about school life such as teachers . my years in the teach...,1
1,story of a man who has unnatural feelings for a pig . starts out with a opening scene that is a terrific example of absurd comedy . a formal orche...,0
2,homelessness or houselessness as george carlin stated has been an issue for years but never a plan to help those on the street that were once co...,1
3,airport starts as a brand new luxury plane is loaded up with valuable paintings such belonging to rich businessman philip stevens james st...,0
4,brilliant over acting by lesley ann warren . best dramatic hobo lady i have ever seen and love scenes in clothes warehouse are second to none . ...,1


In [69]:
# Check row count, column dtypes and non-null counts after the label encoding.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 2 columns):
text      25000 non-null object
labels    25000 non-null int64
dtypes: int64(1), object(1)
memory usage: 390.7+ KB


In [70]:
# Count rows per class to see how balanced the two labels are.
df.labels.value_counts()

1    12500
0    12500
Name: labels, dtype: int64

### Train Test Split <a id="split"></a>

In [71]:
# Hold out the last TEST_SIZE rows for evaluation and train on everything
# before them. The size lives in one named constant so the two slices cannot
# drift apart when it is tuned.
# NOTE(review): this is a positional split with no shuffling — presumably fine
# if the file is not ordered by label or topic; confirm.
TEST_SIZE = 1000
df_train = df.iloc[:-TEST_SIZE]
df_test = df.iloc[-TEST_SIZE:]

In [72]:
# Summarize the training split (row range, dtypes, non-null counts).
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24000 entries, 0 to 23999
Data columns (total 2 columns):
text      24000 non-null object
labels    24000 non-null int64
dtypes: int64(1), object(1)
memory usage: 375.1+ KB


In [73]:
# Summarize the held-out test split (row range, dtypes, non-null counts).
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 24000 to 24999
Data columns (total 2 columns):
text      1000 non-null object
labels    1000 non-null int64
dtypes: int64(1), object(1)
memory usage: 15.7+ KB


### Vetorização <a id="vetorizacao"></a>

In [49]:
# Vocabulary size with the default TfidfVectorizer settings (no min_df/max_df
# pruning). A throwaway vectorizer is fitted here because `vect` is only
# created further down; referencing it at this point raised NameError on
# Restart & Run All. len(vocabulary_) is the number of features and is
# available on every scikit-learn version, unlike get_feature_names(), which
# was removed in scikit-learn 1.2.
len(TfidfVectorizer().fit(df.text).vocabulary_)

74046

In [52]:
def test_df_max(max_df=(1.0, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4), texts=None):
    """Print the TF-IDF matrix shape obtained for each `max_df` cutoff.

    Parameters
    ----------
    max_df : iterable of float or int
        Candidate values, each passed in turn as ``TfidfVectorizer(max_df=...)``.
        The default is a tuple rather than a list so the default object
        cannot be mutated between calls.
    texts : iterable of str, optional
        Documents to vectorize. When omitted, the notebook-level ``df.text``
        is used, which keeps existing calls working unchanged.

    Returns
    -------
    None
        One line per cutoff is printed.
    """
    if texts is None:
        # Fall back to the notebook's global DataFrame of reviews.
        texts = df.text
    for cutoff in max_df:
        shape = TfidfVectorizer(max_df=cutoff).fit_transform(texts).shape
        print(f'shape test for max {cutoff}: {shape}')

In [57]:
def test_df_min(min_df=(1, 10, 100, 1000, 10000), texts=None):
    """Print the TF-IDF matrix shape obtained for each `min_df` cutoff.

    Parameters
    ----------
    min_df : iterable of int or float
        Candidate values, each passed in turn as ``TfidfVectorizer(min_df=...)``.
        The default is a tuple rather than a list so the default object
        cannot be mutated between calls.
    texts : iterable of str, optional
        Documents to vectorize. When omitted, the notebook-level ``df.text``
        is used, which keeps existing calls working unchanged.

    Returns
    -------
    None
        One line per cutoff is printed.
    """
    if texts is None:
        # Fall back to the notebook's global DataFrame of reviews.
        texts = df.text
    for cutoff in min_df:
        shape = TfidfVectorizer(min_df=cutoff).fit_transform(texts).shape
        print(f'shape test for min {cutoff}: {shape}')

### Tamanhos da Saída para Diferentes min, max doc. freq
##### shape test for max 1.0: (25000, 74046)
##### shape test for max 0.9: (25000, 74041)
##### shape test for max 0.8: (25000, 74037)
##### shape test for max 0.7: (25000, 74035)
##### shape test for max 0.6: (25000, 74030)
##### shape test for max 0.5: (25000, 74020)
##### shape test for max 0.4: (25000, 74004)
##### shape test for max 0.3: (25000, 73985)
##### shape test for max 0.2: (25000, 73940)
##### shape test for max 0.1: (25000, 73832)
##### shape test for max 0.05: (25000, 73618)
##### _
##### shape test for min 1: (25000, 74046)
##### shape test for min 10: (25000, 18278)
##### shape test for min 100: (25000, 3793)
##### shape test for min 1000: (25000, 522)
##### shape test for min 10000: (25000, 42)


In [79]:
# Vectorizer used for the model. With an integer min_df and a float max_df,
# terms must appear in at least 100 documents and in at most 50% of them,
# which the shape experiments above show shrinks the vocabulary to a few
# thousand features.
vect = TfidfVectorizer(min_df=100, max_df=0.5)

In [82]:
# Show the vectorizer's full configuration, including the defaults that were
# not set explicitly above.
vect.get_params()

{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.float64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 0.5,
 'max_features': None,
 'min_df': 100,
 'ngram_range': (1, 1),
 'norm': 'l2',
 'preprocessor': None,
 'smooth_idf': True,
 'stop_words': None,
 'strip_accents': None,
 'sublinear_tf': False,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': None,
 'use_idf': True,
 'vocabulary': None}

In [80]:
# Learn the vocabulary and IDF weights from the reviews and build the TF-IDF
# matrix in one step.
# NOTE(review): this fits on the full `df`, which includes the rows held out
# in df_test, so the IDF statistics see the test reviews — consider fitting
# on df_train.text only and calling transform() on the test split.
x = vect.fit_transform(df.text)

In [81]:
# (number of reviews, number of vocabulary terms) of the TF-IDF matrix.
x.shape

(25000, 3767)

In [42]:
# Tally every vocabulary term once. sorted(vect.vocabulary_) yields the terms
# in the same alphabetical order get_feature_names() returned, but works on
# all scikit-learn versions (get_feature_names() was removed in 1.2).
# Counter(iterable) replaces the manual increment loop.
word_c = Counter(sorted(vect.vocabulary_))


In [48]:
# List every (term, count) pair. Each vocabulary term was added to word_c
# exactly once, so all counts are 1.
word_c.most_common()

('wrinklies', 1)

In [83]:
# Peek at the first ten vocabulary terms in alphabetical order.
# sorted(vect.vocabulary_) gives the same list get_feature_names() did, without
# depending on a method that was removed in scikit-learn 1.2.
sorted(vect.vocabulary_)[:10]

['abandoned',
 'abilities',
 'ability',
 'able',
 'about',
 'above',
 'absence',
 'absolute',
 'absolutely',
 'absurd']

### Rede Neural <a id="rede"></a>

In [87]:
class Model(torch.nn.Module):
    """Feed-forward network with one hidden layer: Linear -> ReLU -> Linear -> sigmoid."""

    def __init__(self, D_in, H, D_out):
        """
        Create the two linear layers.

        Parameters
        ----------
        D_in : int
            Number of input features.
        H : int
            Number of hidden units.
        D_out : int
            Number of outputs.
        """
        super().__init__()
        self.linear1 = torch.nn.Linear(D_in, H)
        self.linear2 = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """
        Run the forward pass.

        Parameters
        ----------
        x : torch.Tensor
            Input whose last dimension has size D_in.

        Returns
        -------
        torch.Tensor
            Same leading dimensions as `x`, last dimension D_out, with every
            value squashed into (0, 1) by the sigmoid.
        """
        hidden = F.relu(self.linear1(x))
        # torch.sigmoid is used instead of F.sigmoid, which is deprecated and
        # emits a warning on PyTorch 0.4.1-1.x; the result is identical.
        return torch.sigmoid(self.linear2(hidden))

In [93]:
# Inspect what todense() yields for the TF-IDF matrix before handing it to torch.
type(x.todense())

numpy.matrix

In [95]:
# Convert the dense TF-IDF matrix to a tensor; as the output shows, the result
# has dtype float64.
# NOTE(review): nn.Linear parameters default to float32, so this tensor
# presumably needs a cast (e.g. .float()) before being fed to Model — confirm.
torch.tensor(x.todense())

tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 0.2242, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]],
       dtype=torch.float64)

In [None]:
# B is the batch size (the whole training split: full-batch gradient descent);
# D_in is input dimension; H is hidden dimension; D_out is output dimension.
B, D_in, H, D_out = len(df_train), x.shape[1], 1000, 1

# Seed PyTorch so weight initialisation (and therefore the printed losses)
# is reproducible across runs.
torch.manual_seed(0)

# Build the training tensors from the real data. The TF-IDF matrix `x` has one
# row per row of `df`, and df_train is the leading slice of `df`, so the first
# B rows of `x` are the training reviews. Both tensors are float32 to match
# the dtype of the nn.Linear parameters; the labels get a trailing dimension
# so they have the same (B, 1) shape as the model output.
# The previous `x = torch.tensor(B, D_in)` raised a TypeError (torch.tensor
# takes data, not a shape), overwrote the TF-IDF matrix, and `y` was never
# defined at all.
x_train = torch.tensor(x[:B].toarray(), dtype=torch.float32)
y_train = torch.tensor(df_train.labels.values, dtype=torch.float32).unsqueeze(1)

# Construct our model by instantiating the class defined above
model = Model(D_in, H, D_out)

# Construct our loss function and an Optimizer. The model ends in a sigmoid
# and the targets are 0/1, so binary cross-entropy is the matching loss
# (it replaces the regression-style MSELoss). The call to model.parameters()
# in the SGD constructor will contain the learnable parameters of the two
# nn.Linear modules which are members of the model.
criterion = torch.nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)
for t in range(50):
    # Forward pass: Compute predicted y by passing x to the model
    y_pred = model(x_train)

    # Compute and print loss every 10th iteration. The previous condition
    # `t % 100 == 99` could never be true inside range(50), so nothing was
    # ever printed.
    loss = criterion(y_pred, y_train)
    if t % 10 == 9:
        print(t, loss.item())

    # Zero gradients, perform a backward pass, and update the weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()