In [1]:
import pandas as pd 
import numpy as np 
import seaborn as sns
import os
import json
from typing import Tuple
from tqdm import tqdm
from regression_utils import *
from sklearn.linear_model import LinearRegression
# Set the default figure size (20 x 10) for every plot drawn in this notebook.
sns.set(rc={"figure.figsize": (20, 10)})


In [2]:
# Directory listed by run_paredoes(); the number of entries in it sets how
# many paredões are evaluated.
PATH_TO_DATA = "../analysis/data"

# Column names passed to get_train_test(drop_columns=...) by run_paredoes().
REMOVE = [
    "positivos",
    "neutros",
    "negativos",
    "likes",
    "retweets",
    "day2",
    "day3",
]

# Column-wise accumulator filled by run_paredoes(): each key maps to its own
# list, which receives one entry per evaluated paredão.
paredoes_df = {
    column: []
    for column in ("paredao", "eliminado", "rejeicao", "pred", "pred_pct", "pesos")
}

In [3]:
# Placeholder for the feature column names; overwritten with the second value
# returned by run_paredoes() in a later cell.
cols = None
def df_to_train_data(df: pd.DataFrame) -> Tuple[np.ndarray, pd.DataFrame]:
    """Split a paredão frame into a feature matrix and the regression target.

    Args:
        df: Frame containing the identifier columns ``paredao`` and ``nome``,
            the target column ``rejeicao`` and one column per feature.

    Returns:
        A pair ``(x, y)``. ``x`` is a NumPy array built from every column
        except ``paredao``, ``nome`` and ``rejeicao``. ``y`` is a
        single-column DataFrame holding ``rejeicao``.

    Raises:
        KeyError: If any of ``paredao``, ``nome`` or ``rejeicao`` is missing.
    """
    features = df.drop(columns=["paredao", "nome", "rejeicao"]).to_numpy()
    # Pick the target by name rather than by position ("whatever column is
    # last"): the features exclude `rejeicao` by name, so a different column
    # order must not silently turn one of the features into the target.
    target = df[["rejeicao"]]
    return features, target

def run_paredoes() -> Tuple[pd.DataFrame, list]:
    """Fit one linear regression per paredão and record who it predicts as eliminated.

    For each paredão index a fresh ``LinearRegression`` is fitted on the
    training split returned by ``get_train_test`` and used to predict the
    ``rejeicao`` of every participant in the test split. Per the original
    author's note, the model is trained on the other n-1 paredões to predict
    the n-th; the split itself is produced by ``get_train_test`` (imported
    from ``regression_utils``, not shown here).

    Results are stored in the module-level ``paredoes_df`` accumulator. The
    accumulator is emptied at the start of every call, so re-running the cell
    no longer appends duplicate rows.

    Returns:
        A pair ``(results, feature_names)``. ``results`` has one row per
        evaluated paredão with the columns ``paredao``, ``eliminado`` and
        ``rejeicao`` (participant with the highest actual rejection and that
        value), ``pred`` and ``pred_pct`` (participant with the highest
        predicted rejection and that value) and ``pesos`` (the fitted
        coefficients). ``feature_names`` lists the feature columns of the
        last training frame, in the order used for fitting, so it lines up
        with ``pesos``. It is an empty list when no paredão was evaluated.
    """
    # Start from a clean accumulator: the lists live at module level and would
    # otherwise keep growing across repeated calls.
    for values in paredoes_df.values():
        values.clear()

    n_paredoes = len(os.listdir(PATH_TO_DATA))
    non_feature_columns = ["paredao", "nome", "rejeicao"]
    feature_names: list = []

    # NOTE: the upper bound leaves out the last index on purpose for now; the
    # original note here reads "add one more later".
    for paredao in range(1, n_paredoes):
        regressor = LinearRegression()
        # presumably `mean`/`std` are the statistics used to normalize the
        # target — confirm in regression_utils.get_train_test
        train_df, test_df, mean, std = get_train_test(paredao, normalize=True, drop_columns=REMOVE)

        x_train, y_train = df_to_train_data(train_df)
        x_test, _ = df_to_train_data(test_df)
        feature_names = train_df.drop(columns=non_feature_columns).columns.to_list()

        regressor.fit(x_train, np.ravel(y_train))
        prediction = regressor.predict(x_test)

        # Map prediction and target back through `* std + mean`. `assign`
        # builds a new frame, so the frame handed out by get_train_test is not
        # modified in place (and no SettingWithCopyWarning is raised).
        test_df = test_df.assign(
            predicao=prediction * std + mean,
            rejeicao=test_df["rejeicao"] * std + mean,
        )

        pred_elimination, pred_pct = test_df.sort_values(by="predicao", ascending=False)[["nome", "predicao"]].iloc[0]
        true_elimination, true_pct = test_df.sort_values(by="rejeicao", ascending=False)[["nome", "rejeicao"]].iloc[0]

        paredoes_df["paredao"].append(paredao)
        paredoes_df["eliminado"].append(true_elimination)
        paredoes_df["rejeicao"].append(true_pct)
        paredoes_df["pred"].append(pred_elimination)
        paredoes_df["pred_pct"].append(pred_pct)
        paredoes_df["pesos"].append(regressor.coef_)

    return pd.DataFrame(paredoes_df), feature_names


# Análise de importância de cada atributo

In [4]:
# Run the per-paredão evaluation. `cols` receives the feature names in the
# same order as the coefficients stored in each row's `pesos`.
linear_reg_df, cols = run_paredoes()


In [5]:
# Keep only the paredões where the predicted most-rejected participant matches
# the actual one. `.copy()` makes the result an independent frame, so adding
# columns to it afterwards does not raise SettingWithCopyWarning.
linear_reg_df = linear_reg_df[linear_reg_df["eliminado"] == linear_reg_df["pred"]].copy()

In [6]:
# For each paredão, name the feature whose fitted coefficient is the largest.
# The column is built in one pass and attached with `assign` instead of the
# chained `df["col"].iloc[i] = ...` pattern, which triggers
# SettingWithCopyWarning and is a silent no-op under pandas Copy-on-Write.
# NOTE(review): argmax ranks by signed weight; if "importance" should mean
# magnitude, use np.abs(coef).argmax() instead. Several rows show weights
# around 1e14 — worth confirming the coefficients are meaningful.
most_important = [cols[coef.argmax()] for coef in linear_reg_df["pesos"]]
linear_reg_df = linear_reg_df.assign(most_important=most_important)

In [7]:
# Show the correctly predicted paredões together with their highest-weighted feature.
linear_reg_df

Unnamed: 0,paredao,eliminado,rejeicao,pred,pred_pct,pesos,most_important
0,1,lucas_chumbo,0.7594,lucas_chumbo,0.821162,"[105819968318966.69, 85881142572823.61, 140717...",negativos_individual_pct
1,2,petrix,0.8027,petrix,0.736366,"[97056432047169.02, 78768850630609.45, 1290634...",negativos_individual_pct
2,3,hadson,0.7971,hadson,0.685095,"[102780301311607.3, 83414216152603.22, 1366749...",negativos_individual_pct
3,4,lucas,0.6262,lucas,0.491879,"[-0.06307720520291525, 0.14149358684751284, -0...",neutros_global_pct
4,5,bianca_andrade,0.5309,bianca_andrade,0.348445,"[123453310013884.33, 100191972146802.0, 164165...",negativos_individual_pct
5,6,guilherme,0.5607,guilherme,0.527756,"[-0.023146259253458694, 0.14516414296388386, -...",neutros_global_pct
6,7,victor_hugo,0.8522,victor_hugo,0.346532,"[-0.07193048432835142, 0.16374597320414153, -0...",neutros_global_pct
7,8,pyong,0.517,pyong,0.616469,"[62016664609456.336, 50331351443630.23, 824683...",negativos_individual_pct
8,9,daniel,0.8082,daniel,0.648557,"[-0.10407626626465939, 0.21344152440934838, -0...",neutros_global_pct
9,10,felipe_prior,0.5673,felipe_prior,1.384012,"[0.08326041687716336, 0.13267750781171872, -0....",fora


In [8]:
# Count, for each feature, in how many paredões it had the largest coefficient.
linear_reg_df.groupby("most_important")["paredao"].count()

most_important
fora                        1
negativos_individual_pct    5
neutros_global_pct          6
Name: paredao, dtype: int64