Exercises XP


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import zipfile
from kaggle.api.kaggle_api_extended import KaggleApi
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder

In [2]:
# Exercise 1: Duplicate Detection And Removal

In [3]:
# Initialize Kaggle API
api = KaggleApi()
api.authenticate()  # Make sure you have properly set up your Kaggle API credentials

# Download the dataset
api.dataset_download_files("brendan45774/test-file", path=".", unzip=True)

# List the files in the current directory
files = os.listdir()

# Now you can directly access the dataset files without saving them to a specific folder
for file in files:
    if file.endswith('.csv'):
        print(f"Found CSV file: {file}")
        # Here you can perform further operations with the CSV file


Found CSV file: tested.csv


In [4]:
df_titanic = pd.read_csv('tested.csv')
df = df_titanic.copy()

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Survived     418 non-null    int64  
 2   Pclass       418 non-null    int64  
 3   Name         418 non-null    object 
 4   Sex          418 non-null    object 
 5   Age          332 non-null    float64
 6   SibSp        418 non-null    int64  
 7   Parch        418 non-null    int64  
 8   Ticket       418 non-null    object 
 9   Fare         417 non-null    float64
 10  Cabin        91 non-null     object 
 11  Embarked     418 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 39.3+ KB


In [6]:
df.duplicated().sum()

0

In [7]:
# no duplicate rows based on all columns

In [8]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [9]:
# Exercise 2: Handling Missing Values

In [10]:
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [11]:
# columns with missings: 'Age', 'Fare', 'Cabin'

In [12]:
df = df.drop(columns='Cabin') # i'll drop 'Cabin' column because it isn't immportant

In [13]:
# i'll replace missings in 'Age' using KNN
from sklearn.impute import KNNImputer

imputer = KNNImputer(n_neighbors=2)
age_clm = df[['Age']]
imputed_age_clm = imputer.fit_transform(age_clm)
df['Age'] = imputed_age_clm
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Survived     418 non-null    int64  
 2   Pclass       418 non-null    int64  
 3   Name         418 non-null    object 
 4   Sex          418 non-null    object 
 5   Age          418 non-null    float64
 6   SibSp        418 non-null    int64  
 7   Parch        418 non-null    int64  
 8   Ticket       418 non-null    object 
 9   Fare         417 non-null    float64
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(5), object(4)
memory usage: 36.1+ KB
None


In [14]:
# i'll replace 1 missing in flare using median because its abnormal distribution (we have outliers in this column)
df = df.fillna({'Fare': df['Fare'].median()})

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Survived     418 non-null    int64  
 2   Pclass       418 non-null    int64  
 3   Name         418 non-null    object 
 4   Sex          418 non-null    object 
 5   Age          418 non-null    float64
 6   SibSp        418 non-null    int64  
 7   Parch        418 non-null    int64  
 8   Ticket       418 non-null    object 
 9   Fare         418 non-null    float64
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(5), object(4)
memory usage: 36.1+ KB


In [16]:
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.50000,0,0,330911,7.8292,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.00000,1,0,363272,7.0000,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.00000,0,0,240276,9.6875,Q
3,895,0,3,"Wirz, Mr. Albert",male,27.00000,0,0,315154,8.6625,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.00000,1,1,3101298,12.2875,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,0,3,"Spector, Mr. Woolf",male,30.27259,0,0,A.5. 3236,8.0500,S
414,1306,1,1,"Oliva y Ocana, Dona. Fermina",female,39.00000,0,0,PC 17758,108.9000,C
415,1307,0,3,"Saether, Mr. Simon Sivertsen",male,38.50000,0,0,SOTON/O.Q. 3101262,7.2500,S
416,1308,0,3,"Ware, Mr. Frederick",male,30.27259,0,0,359309,8.0500,S


In [17]:
# Exercise 3: Feature Engineering
# Creating 'Family Size' from 'SibSp' and 'Parch'
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1

# Extracting 'Title' from the 'Name' column
df['Title'] = df['Name'].apply(lambda x: x.split(',')[1].split('.')[0].strip())

In [18]:
# Exercise 4: Outlier Detection and Handling
def detect_outliers_iqr(data):
    Q1 = data.quantile(0.25)
    Q3 = data.quantile(0.75)
    IQR = Q3 - Q1
    return data[(data < (Q1 - 1.5 * IQR)) | (data > (Q3 + 1.5 * IQR))]

def cap_outliers(data):
    Q1 = data.quantile(0.25)
    Q3 = data.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return data.clip(lower=lower_bound, upper=upper_bound)

In [19]:
df['Fare'] = cap_outliers(df['Fare'])
df['Age'] = cap_outliers(df['Age'])

In [20]:
# Exercise 5: Data Standardization and Normalization
scaler = StandardScaler()
df['Fare'] = scaler.fit_transform(df[['Fare']])

scaler = MinMaxScaler()
df['Age'] = scaler.fit_transform(df[['Age']])

In [21]:
# Exercise 6: Feature Encoding
df = pd.get_dummies(df, columns=['Sex', 'Embarked', 'Title'], drop_first=True)

In [22]:
# Exercise 7: Data Transformation for Age Feature
df['AgeGroup'] = pd.cut(df['Age'], bins=[0, 12, 18, 35, 60, 100], labels=['Child', 'Teenager', 'Adult', 'MiddleAged', 'Senior'])
df = pd.get_dummies(df, columns=['AgeGroup'])

In [23]:
# Inspect the final DataFrame
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,FamilySize,...,Title_Miss,Title_Mr,Title_Mrs,Title_Ms,Title_Rev,AgeGroup_Child,AgeGroup_Teenager,AgeGroup_Adult,AgeGroup_MiddleAged,AgeGroup_Senior
0,892,0,3,"Kelly, Mr. James",0.60049,0,0,330911,-0.794366,1,...,False,True,False,False,False,True,False,False,False,False
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",0.845588,1,0,363272,-0.833539,2,...,False,False,True,False,False,True,False,False,False,False
2,894,0,2,"Myles, Mr. Thomas Francis",1.0,0,0,240276,-0.706576,1,...,False,True,False,False,False,True,False,False,False,False
3,895,0,3,"Wirz, Mr. Albert",0.453431,0,0,315154,-0.754999,1,...,False,True,False,False,False,True,False,False,False,False
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",0.355392,1,1,3101298,-0.583747,3,...,False,False,True,False,False,True,False,False,False,False
