In [215]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [216]:
def load_data(path="TF_2023-FII.csv"):
    """Read the dataset CSV into a DataFrame.

    Parameters
    ----------
    path : str or path-like, default "TF_2023-FII.csv"
        Location of the CSV file. The default reproduces the previous
        behaviour of reading ``TF_2023-FII.csv`` relative to the current
        working directory, so existing ``load_data()`` calls are unaffected.

    Returns
    -------
    pandas.DataFrame
        The file contents as parsed by ``pd.read_csv`` with default options.
        No dtype, decimal or thousands settings are applied here; numeric
        text columns are converted later by the preprocessing helpers.
    """
    return pd.read_csv(path)

def countNaNValues(df):
    """Return the number of missing values in each column of ``df``.

    The result is a Series indexed by column name.
    """
    missing_mask = df.isna()
    return missing_mask.sum()

def countDistinct(df, column):
    """Count the distinct non-missing values found in ``df[column]``."""
    selected = df[column]
    return selected.nunique(dropna=True)

def remove_column(df, column):
    """Drop ``column`` from ``df`` in place; do nothing when it is absent.

    The frame passed in is mutated and nothing is returned.
    """
    if column not in df.columns:
        return
    df.drop(columns=column, inplace=True)

def clean_and_convert_to_float(df, column_name):
    """Convert a text column written with '.' grouping and ',' decimals to float.

    Every '.' is removed, every ',' becomes '.', and the result is cast to
    float. ``df`` is modified in place; the column must support the ``.str``
    accessor (i.e. hold strings).
    """
    original_text = df[column_name]
    # Step 1: strip the grouping dots and store the intermediate text.
    df[column_name] = original_text.str.replace('.', '', regex=False)
    # Step 2: switch the decimal comma to a point, then cast.
    with_decimal_point = df[column_name].str.replace(',', '.', regex=False)
    df[column_name] = with_decimal_point.astype(float)

def convert_column_percentage_to_float(df, column_name):
    """Convert percentage text such as ``"0,83 %"`` to float, in place.

    Commas become decimal points, trailing spaces and ``%`` characters are
    stripped, and the remainder is cast to float. The value is not divided
    by 100.
    """
    raw_text = df[column_name]
    dotted = raw_text.str.replace(',', '.', regex=False)
    trimmed = dotted.str.rstrip(' %')
    df[column_name] = trimmed.astype(float)

def convert_column_int_to_float(df, column_name):
    """Convert a column of '.'-grouped whole numbers to float, in place.

    Each value is turned into its string form, every literal '.' is removed,
    and the result is cast to float (``"226.183"`` becomes ``226183.0``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; the column is overwritten.
    column_name : str
        Name of the column to convert.

    Notes
    -----
    Because the value is stringified first, a column that pandas already
    parsed as float would have its decimal point removed as well
    (``373.0`` -> ``"373.0"`` -> ``3730.0``).
    NOTE(review): confirm the column arrives as text or integers.
    """
    as_text = df[column_name].astype(str)
    # regex=False makes '.' a literal dot on every pandas version and avoids
    # the FutureWarning older versions raise for a single-character pattern.
    df[column_name] = as_text.str.replace('.', '', regex=False).astype(float)

def preprocessing(df):
    """Drop a fixed set of columns and convert the kept text columns to float.

    ``df`` is mutated in place and nothing is returned. Columns in the drop
    list that are not present are skipped by ``remove_column``.
    """
    dropped_columns = (
        'PREÇO ATUAL (R$)',
        'LIQUIDEZ DIÁRIA (R$)',
        'DIVIDEND YIELD',
        'DY (12M) ACUMULADO',
        'DY (3M) MÉDIA',
        'VARIAÇÃO PREÇO',
        'PATRIMÔNIO LÍQUIDO',
        'VPA',
        'P/VPA',
        'DY PATRIMONIAL',
        'VARIAÇÃO PATRIMONIAL',
        'VOLATILIDADE',
        'TAX. ADMINISTRAÇÃO',
        'TAX. PERFORMANCE',
        'TAX. GESTÃO',
    )
    for dropped in dropped_columns:
        remove_column(df, dropped)

    # Conversions that are disabled because their columns are dropped above.
    # Re-enable the matching call if a column is taken off the drop list:
    #   clean_and_convert_to_float: 'LIQUIDEZ DIÁRIA (R$)', 'PATRIMÔNIO LÍQUIDO',
    #       'VOLATILIDADE', 'VPA', 'P/VPA'
    #   convert_column_percentage_to_float: 'VARIAÇÃO PREÇO', 'DY PATRIMONIAL',
    #       'VARIAÇÃO PATRIMONIAL'
    clean_and_convert_to_float(df, 'P/VP')
    convert_column_int_to_float(df, 'NUM. COTISTAS')
    convert_column_percentage_to_float(df, 'DY (12M) MÉDIA')
    
def print_distinct_values(dataframe, column_name):
    """Print a header line, then each distinct value of the column on its own line."""
    uniques = dataframe[column_name].unique()
    header = f"Distinct values in column '{column_name}':"
    # One print call emits the same text as printing line by line.
    print("\n".join([header, *map(str, uniques)]))
    
def plot_dataframes_with_color(dataframes, y_column1, y_column2):
    """Scatter-plot two columns for several DataFrames on a single figure.

    Each frame is drawn in its own colour and labelled in the legend with the
    value of its first ``'TIPO'`` row.

    Parameters
    ----------
    dataframes : iterable of pandas.DataFrame
        Frames to draw; each must contain ``y_column1``, ``y_column2`` and
        ``'TIPO'``.
    y_column1 : str
        Column plotted on the x axis.
    y_column2 : str
        Column plotted on the y axis.

    Notes
    -----
    The palette has five colours and is reused cyclically, so frames beyond
    the fifth are still plotted instead of being silently dropped. Empty
    frames are skipped because they have no first row to take a label from.
    """
    plt.figure(figsize=(10, 6))
    colors = ['blue', 'orange', 'green', 'red', 'purple']
    for idx, df in enumerate(dataframes):
        if df.empty:
            # Nothing to draw, and df['TIPO'].iloc[0] would raise IndexError.
            continue
        color = colors[idx % len(colors)]
        plt.scatter(df[y_column1], df[y_column2], label=df['TIPO'].iloc[0], color=color)
    plt.xlabel(y_column1)
    plt.ylabel(y_column2)
    plt.title(f'{y_column1} vs {y_column2} with TIPO as color')
    plt.legend()
    plt.show()

In [217]:
# Load the raw CSV and clean it. preprocessing() mutates df in place, so this
# cell must be re-run from load_data() before preprocessing is applied again.
df = load_data()

preprocessing(df)

# --- Disabled exploratory analysis (kept for reference) ---------------------
# Per-TIPO subsets used by the scatter plot further down.
#dfTIJOLO = df[df['TIPO'] == 'TIJOLO']
#dfPAPEL = df[df['TIPO'] == 'PAPEL']
#dfMISTO = df[df['TIPO'] == 'MISTO']
#dfSHOPPING = df[df['TIPO'] == 'SHOPPING']
#dfDESENVOLVIMENTO = df[df['TIPO'] == 'DESENVOLVIMENTO']

# Correlation heatmap.
# NOTE(review): df still holds the text column 'TIPO'; recent pandas versions
# may need df.corr(numeric_only=True) here - confirm before re-enabling.
#correlation_matrix = df.corr()

#plt.figure(figsize=(8, 6))
#sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
#plt.title('Correlation Matrix')
#plt.show()

#print(dfSHOPPING)

# NOTE: 'PATRIMÔNIO LÍQUIDO' is removed by preprocessing(), so this call would
# fail with a KeyError unless that column is kept.
#plot_dataframes_with_color([dfTIJOLO, dfPAPEL, dfMISTO, dfSHOPPING, dfDESENVOLVIMENTO], 'PATRIMÔNIO LÍQUIDO', 'NUM. COTISTAS')

  df[column_name] = df[column_name].astype(str).str.replace('.', '').astype(float)


In [238]:
# Train a decision tree that predicts 'TIPO' from four numeric features, report
# hold-out accuracy, then classify a handful of hand-written rows.

# Row count before filtering.
print("size if", len(df))
# Single source of truth for the feature set, reused for cleaning, training
# and prediction so the column order can never drift between them.
feature_columns = ['P/VP', 'DY (12M) MÉDIA', 'QUANT. ATIVOS', 'NUM. COTISTAS']
columns_to_check = feature_columns + ['TIPO']
# Drop rows missing any feature or the label, and rows whose label is the
# placeholder '?'.
df = df.dropna(subset=columns_to_check)
df = df[df['TIPO'] != '?']
# Row count after filtering.
print("size if", len(df))

# Splitting data into features and target
X = df[feature_columns]
y = df['TIPO']

# Splitting data into train and test sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Creating a Decision Tree Classifier.
# random_state is fixed because the tree permutes features at each split and
# breaks ties between equally good splits at random; without a seed the
# reported accuracy and predictions change from run to run.
clf = DecisionTreeClassifier(random_state=42)

# Training the classifier
clf.fit(X_train, y_train)

# Making predictions
y_pred = clf.predict(X_test)

# Evaluating the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of the Decision Tree Classifier: {accuracy:.2f}")

# Hand-written rows to classify.
# NOTE(review): the third and fourth 'P/VP' values (213783.30, 15546428.15)
# are orders of magnitude larger than the other entries - confirm they are
# intended price-to-book values and not taken from another column.
new_data = {
    'P/VP': [1.027, 4.52757442, 213783.30, 15546428.15, 0],
    'DY (12M) MÉDIA': [0.60, 1.01, 0.83, 0.99, 0],
    'QUANT. ATIVOS': [0, 0, 1, 2, 4],
    'NUM. COTISTAS': [373, 226183, 286, 370163, 220]
}

# Convert new_data to a DataFrame
new_df = pd.DataFrame(new_data)

# Make predictions for new rows; select the columns explicitly so they are
# passed in the same order the model was trained on.
predicted_tipo = clf.predict(new_df[feature_columns])

# Display predicted 'TIPO' values
print("Predicted 'TIPO' values:")
print(predicted_tipo)

size if 344
size if 344
Accuracy of the Decision Tree Classifier: 0.62
Predicted 'TIPO' values:
['TIJOLO' 'MISTO' 'PAPEL' 'PAPEL' 'DESENVOLVIMENTO']
