In [1]:
import pandas as pd 
import numpy as np 
import seaborn as sns
import os
import json
from typing import Tuple
from tqdm import tqdm
from regression_utils import *
from sklearn.linear_model import LinearRegression, SGDRegressor
sns.set(rc={"figure.figsize": (20, 10)})


In [2]:
# Keyword arguments unpacked into SGDRegressor(**sgd_params) for every fit.
# NOTE(review): no random_state is given, so results may differ between runs.
# NOTE(review): scikit-learn documents l1_ratio as only used when
# penalty="elasticnet"; with penalty="l2" it presumably has no effect - confirm.
sgd_params = dict(
    alpha=1,
    epsilon=0.15,
    l1_ratio=0.3,
    learning_rate="optimal",
    loss="squared_epsilon_insensitive",
    penalty="l2",
)

In [3]:
# Directory whose entries are counted to decide how many paredões to evaluate.
PATH_TO_DATA = "../analysis/data"

# Column names handed to get_train_test(drop_columns=...).
REMOVE = [
    "positivos",
    "neutros",
    "negativos",
    "likes",
    "retweets",
    "day2",
    "day3",
]

# Column-oriented accumulator (one independent list per output column) that
# run_paredoes() appends to, one value per evaluated paredão.
paredoes_df = {
    column: []
    for column in ("paredao", "eliminado", "rejeicao", "pred", "pred_pct", "pesos")
}

In [4]:
# Placeholder for the feature names; filled in once run_paredoes() has been called.
cols = None


def df_to_train_data(df: pd.DataFrame) -> Tuple[np.ndarray, pd.DataFrame]:
    """Split a paredão frame into a feature matrix and its target column.

    Args:
        df: Frame containing at least the "paredao", "nome" and "rejeicao"
            columns; every other column is treated as a feature.

    Returns:
        A tuple ``(x, y)`` where ``x`` is a NumPy array holding all columns
        except "paredao", "nome" and "rejeicao", and ``y`` is a one-column
        DataFrame holding "rejeicao".

    Raises:
        KeyError: If any of the three named columns is missing.
    """
    features = df.drop(columns=["paredao", "nome", "rejeicao"]).to_numpy()
    # Select the target by name instead of "whatever column happens to be last",
    # so the result stays correct regardless of column order.
    target = df[["rejeicao"]]
    return features, target

def run_paredoes() -> Tuple[pd.DataFrame, list]:
    """Train on n-1 paredões to predict the n-th, for every paredão.

    For each paredão a fresh ``SGDRegressor(**sgd_params)`` is fitted on the
    training split returned by ``get_train_test`` and used to predict the
    held-out split. Predicted and true values are mapped back with
    ``value * std + mean`` (presumably undoing the normalisation applied by
    ``get_train_test`` - confirm), and the contestant with the highest value
    of each is recorded.

    Side effects:
        Clears and then refills the module-level ``paredoes_df`` accumulator,
        so calling the function again does not duplicate rows.

    Returns:
        A tuple ``(results, feature_names)``: ``results`` has one row per
        paredão with the columns of ``paredoes_df``; ``feature_names`` lists
        the training columns (everything except "paredao", "nome" and
        "rejeicao") in the same order as the coefficients stored in "pesos".
        When ``PATH_TO_DATA`` has no entries, the frame is empty and the
        list is ``[]``.
    """
    # Start from a clean accumulator: without this, re-running the cell would
    # append a second copy of every paredão to the shared lists.
    for values in paredoes_df.values():
        values.clear()

    # NOTE(review): every directory entry is counted as one paredão; stray
    # files in PATH_TO_DATA would inflate the count - confirm the layout.
    paredoes = len(os.listdir(PATH_TO_DATA))
    non_feature_columns = ("paredao", "nome", "rejeicao")
    feature_names = []

    for paredao in range(1, paredoes + 1):
        regressor = SGDRegressor(**sgd_params)
        train_df, test_df, mean, std = get_train_test(paredao, normalize=True, drop_columns=REMOVE)

        x_train, y_train = df_to_train_data(train_df)
        x_test, _ = df_to_train_data(test_df)

        regressor.fit(x_train, np.ravel(y_train))
        prediction = regressor.predict(x_test)

        # Build a new frame instead of mutating the one get_train_test returned.
        test_df = test_df.assign(
            predicao=prediction * std + mean,
            rejeicao=test_df["rejeicao"] * std + mean,
        )

        # First row after a descending sort = contestant with the highest value.
        ranked_by_pred = test_df.sort_values(by="predicao", ascending=False)
        pred_elimination, pred_pct = ranked_by_pred[["nome", "predicao"]].iloc[0]
        ranked_by_true = test_df.sort_values(by="rejeicao", ascending=False)
        true_elimination, true_pct = ranked_by_true[["nome", "rejeicao"]].iloc[0]

        paredoes_df["paredao"].append(paredao)
        paredoes_df["eliminado"].append(true_elimination)
        paredoes_df["rejeicao"].append(true_pct)
        paredoes_df["pred"].append(pred_elimination)
        paredoes_df["pred_pct"].append(pred_pct)
        paredoes_df["pesos"].append(regressor.coef_)

        # Recorded inside the loop so an empty data directory does not leave
        # train_df unbound at return time.
        feature_names = [name for name in train_df.columns if name not in non_feature_columns]

    return pd.DataFrame(paredoes_df), feature_names


# Análise de importância de cada atributo

In [5]:
# Evaluate every paredão; `cols` receives the feature names, in the same order
# as the coefficient vectors stored in the "pesos" column.
linear_reg_df, cols = run_paredoes()


In [6]:
# Keep only the paredões where the predicted eliminated contestant matches the
# real one. The explicit .copy() makes the result an independent frame, so
# adding a column to it afterwards does not raise SettingWithCopyWarning.
linear_reg_df = linear_reg_df[linear_reg_df["eliminado"] == linear_reg_df["pred"]].copy()

In [7]:
# Label each paredão with the feature whose coefficient is the largest.
# Built in a single assign() rather than element-wise `df[col].iloc[i] = ...`,
# which is chained assignment: it warns on older pandas and silently leaves the
# frame unchanged when Copy-on-Write is enabled.
# NOTE(review): argmax picks the most positive weight, not the largest
# magnitude - confirm this is the intended notion of "importance".
linear_reg_df = linear_reg_df.assign(
    most_important=[cols[coef.argmax()] for coef in linear_reg_df["pesos"]]
)

In [8]:
# Display the per-paredão results, including the "most_important" feature label.
linear_reg_df

Unnamed: 0,paredao,eliminado,rejeicao,pred,pred_pct,pesos,most_important
0,1,lucas_chumbo,0.7594,lucas_chumbo,0.546847,"[-0.1306719976217005, 0.14993066064248814, -0....",fora
1,2,petrix,0.8027,petrix,0.507881,"[-0.14344123183199492, 0.15747264407275582, 0....",fora
2,3,hadson,0.7971,hadson,0.463044,"[-0.1706452918534501, 0.15790194129509028, 0.0...",neutros_individual_pct
3,4,lucas,0.6262,lucas,0.458199,"[-0.14426375626894505, 0.14989393087429814, 0....",fora
5,6,guilherme,0.5607,guilherme,0.464838,"[-0.13263231772463355, 0.15904552655435303, -0...",fora
6,7,victor_hugo,0.8522,victor_hugo,0.289926,"[-0.14798501850103749, 0.19157003984431778, -0...",fora
7,8,pyong,0.517,pyong,0.597248,"[-0.08703892977642724, 0.017888030868848836, 0...",neutros_global_pct
8,9,daniel,0.8082,daniel,0.481588,"[-0.1302196456743085, 0.15430830981155433, -0....",fora
9,10,felipe_prior,0.5673,felipe_prior,0.89411,"[-0.14818040246027167, 0.1715952236651637, -0....",fora
11,12,marcela,0.4976,marcela,0.394965,"[-0.12727084092898935, 0.15313290045408454, -0...",fora


In [9]:
# Number of paredões for which each feature carried the largest coefficient.
(
    linear_reg_df
    .groupby("most_important")["paredao"]
    .count()
)

most_important
fora                      9
neutros_global_pct        2
neutros_individual_pct    1
Name: paredao, dtype: int64