In [37]:
# Pacotes básicos
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Pacotes do Scikit-learn para acesso a datasets, preparação, modelagem e avaliação
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Pacotes para modelos de aprendizado
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Pacote de balanceamento de classes desbalanceadas
# from imblearn.over_sampling import SMOTE

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, recall_score, f1_score, roc_curve, roc_auc_score, auc

In [38]:
#Reading dataset
df = pd.read_csv("treino.csv")
print(df.dtypes)
df.sample(10)

IDpassageiro     object
Nome             object
Deck             object
NumeroCabine      int64
PosicaoCabine    object
Procedencia      object
Destino          object
Idade             int64
VIP                bool
AllInclusive       bool
ServicoCabine     int64
Restaurante       int64
Shopping          int64
SPA               int64
ParquedeAguas     int64
Sobrevivente       bool
dtype: object


Unnamed: 0,IDpassageiro,Nome,Deck,NumeroCabine,PosicaoCabine,Procedencia,Destino,Idade,VIP,AllInclusive,ServicoCabine,Restaurante,Shopping,SPA,ParquedeAguas,Sobrevivente
847,4912_02,Bats Bane,F,299,P,China,China,31,True,False,484,385,8670,789,317,True
708,4396_01,Sallie Milton,F,1265,P,China,Poland,61,False,False,1352,572,9629,2629,261,True
826,6146_01,Scelava Groforacid,C,1085,P,Indonesia,Brazil,8,False,True,1258,384,5973,2161,190,False
601,4645_01,Aton Bacistion,A,1155,S,Norway,China,73,True,False,1146,1021,3770,508,223,True
653,4690_01,Mediah Dishocatal,E,382,P,Norway,Indonesia,8,True,False,1601,1198,14295,2559,581,False
621,1324_03,Lewise Gilleyons,A,915,P,Armenia,Portugal,66,False,True,1051,1424,9875,1320,575,True
570,2819_01,Stal Ocherman,B,173,S,Indonesia,China,31,False,True,1097,259,1518,967,408,False
809,9072_01,Gork Creke,H,290,S,Philippines,France,10,True,True,541,1230,7958,1352,103,True
131,4402_01,Louisy Waderoachez,H,297,P,Macedonia,Sierra Leone,52,False,True,688,964,13871,926,609,False
839,6921_03,Grum Stersetery,G,707,S,China,Honduras,9,False,True,151,1409,10992,2221,33,False


In [39]:
# Step 1: Count the number of unique nationalities
unique_nationalities = df['Procedencia'].nunique()
print(f"Number of unique nationalities: {unique_nationalities}")

# Step 2: Calculate the count and percentage of each nationality
nationality_counts = df['Procedencia'].value_counts()  # Counts of each nationality
nationality_percentages = df['Procedencia'].value_counts(normalize=True) * 100  # Percentage of each nationality

# Step 3: Display results
nationality_summary = pd.DataFrame({
    'Count': nationality_counts,
    'Percentage': nationality_percentages
})

# Display the summary table
print(nationality_summary)

Number of unique nationalities: 122
             Count  Percentage
Procedencia                   
China          181        18.1
Indonesia      121        12.1
Russia          58         5.8
Philippines     47         4.7
Brazil          34         3.4
...            ...         ...
Botswana         1         0.1
Kenya            1         0.1
Malawi           1         0.1
Samoa            1         0.1
Hungary          1         0.1

[122 rows x 2 columns]


In [40]:
threshold = 15
df['Procedencia'] = df['Procedencia'].apply(
    lambda x: x if nationality_counts[x] > threshold else 'Outra'
)

# Step 3: Display results
nationality_summary = pd.DataFrame({
    'Count': nationality_counts,
    'Percentage': nationality_percentages
})

# Display the summary table
print(nationality_summary)

df

             Count  Percentage
Procedencia                   
China          181        18.1
Indonesia      121        12.1
Russia          58         5.8
Philippines     47         4.7
Brazil          34         3.4
...            ...         ...
Botswana         1         0.1
Kenya            1         0.1
Malawi           1         0.1
Samoa            1         0.1
Hungary          1         0.1

[122 rows x 2 columns]


Unnamed: 0,IDpassageiro,Nome,Deck,NumeroCabine,PosicaoCabine,Procedencia,Destino,Idade,VIP,AllInclusive,ServicoCabine,Restaurante,Shopping,SPA,ParquedeAguas,Sobrevivente
0,5647_02,Chabih Matoltuble,D,193,P,Outra,Peru,61,False,True,239,1288,3380,1158,587,True
1,0107_01,Coobix Hart,A,1202,S,Outra,Argentina,31,True,False,466,6,11299,1598,506,True
2,4158_01,Doryn Noeley,G,107,P,Peru,China,49,True,True,1240,674,10989,806,463,True
3,1316_01,Sadrus Quelfly,G,545,S,Indonesia,Greenland,32,False,True,176,1186,8750,1438,379,True
4,9069_05,Mirfark Taketiatim,D,392,P,Indonesia,Brazil,5,False,False,90,1318,161,2853,203,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,6802_01,Eleb Disteady,G,1310,S,Outra,Brazil,61,True,True,1569,106,8090,2915,680,True
996,0313_01,Briney Mckinsond,G,1165,P,United States,Philippines,48,False,True,1455,70,8001,2858,361,True
997,9094_01,Aleen Hayertez,E,520,P,Brazil,Ukraine,41,False,False,471,841,11890,100,437,True
998,6594_01,Jord Schmondez,C,167,P,Poland,United States,75,False,True,1072,1133,13953,1355,384,False


In [41]:
# Codificação de Deck usando Label Encoding
df['Deck'] = df['Deck'].astype('category').cat.codes

# Codificação de PosicaoCabine (P = 1, S = 0)
df['PosicaoCabine'] = df['PosicaoCabine'].map({'P': 1, 'S': 0})
df.head(10)

Unnamed: 0,IDpassageiro,Nome,Deck,NumeroCabine,PosicaoCabine,Procedencia,Destino,Idade,VIP,AllInclusive,ServicoCabine,Restaurante,Shopping,SPA,ParquedeAguas,Sobrevivente
0,5647_02,Chabih Matoltuble,3,193,1,Outra,Peru,61,False,True,239,1288,3380,1158,587,True
1,0107_01,Coobix Hart,0,1202,0,Outra,Argentina,31,True,False,466,6,11299,1598,506,True
2,4158_01,Doryn Noeley,6,107,1,Peru,China,49,True,True,1240,674,10989,806,463,True
3,1316_01,Sadrus Quelfly,6,545,0,Indonesia,Greenland,32,False,True,176,1186,8750,1438,379,True
4,9069_05,Mirfark Taketiatim,3,392,1,Indonesia,Brazil,5,False,False,90,1318,161,2853,203,True
5,6756_02,Monah Pittson,1,852,1,China,Russia,72,False,True,1548,979,5356,1648,480,True
6,8056_02,Marrai Ausivetpul,2,1150,0,Brazil,Indonesia,69,True,True,603,792,13718,2869,364,False
7,2993_01,Nelly Gordanieves,7,358,0,China,Portugal,5,True,False,84,873,3754,1110,357,True
8,4564_01,Tera Merkins,4,710,0,China,Brazil,47,True,True,420,634,8517,423,487,True
9,0600_01,Muonon Ormler,2,877,0,Indonesia,United States,6,False,False,1360,16,13964,525,282,False


In [42]:
# Step 1: Calculate frequency of each nationality
nationality_frequency = df['Procedencia'].value_counts(normalize=True)

# Step 2: Map frequency to each row in the 'Nationality' column
df['Nationality_Frequency'] = df['Procedencia'].map(nationality_frequency)

# Display the first few rows to see the result
df[['Procedencia', 'Nationality_Frequency']].head()


Unnamed: 0,Procedencia,Nationality_Frequency
0,Outra,0.336
1,Outra,0.336
2,Peru,0.018
3,Indonesia,0.121
4,Indonesia,0.121


In [None]:
# Step 2: Calculate the correlation between frequency-encoded nationality and survival
correlation = df['Nationality_Frequency'].corr(df['Sobrevivente'])
print(f"Correlation between nationality frequency and survival: {correlation}")
