In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

-------------------- Exploratory Data Analysis --------------------

-> Univariate Analysis

In [None]:
# Read Titanic.csv into the working frame used by the univariate plots below.
df = pd.read_csv(filepath_or_buffer='Titanic.csv')

In [None]:
# Call the method: a bare `df.head` only echoes the bound-method repr
# instead of showing the first rows.
df.head()

In [None]:
# Count Plot(Categorical Data)
# Draw the seaborn and pandas versions side by side; issuing both calls
# against the implicit current axes stacks the two bar charts on top of
# each other. The column is passed as `x=` explicitly because recent seaborn
# releases treat the first positional argument as `data`.
fig, (ax_seaborn, ax_pandas) = plt.subplots(ncols=2, figsize=(10, 4))
sns.countplot(x=df['Sex'], ax=ax_seaborn)
ax_seaborn.set_title('seaborn countplot')
df['Sex'].value_counts().plot(kind='bar', ax=ax_pandas)
ax_pandas.set_title('pandas value_counts bar')
plt.show()


In [None]:
# Pie Chart(Categorical Data)
# Share of each 'Sex' value, labelled with a two-decimal percentage.
sex_counts = df['Sex'].value_counts()
sex_counts.plot.pie(autopct='%.2f')

In [None]:
# Histogram(Numerical Data)
# matplotlib is already imported as `plt` in the first cell, so the repeated
# import is dropped. The trailing semicolon hides the (counts, bins, patches)
# tuple that would otherwise be echoed under the figure.
plt.hist(df['Age']);

In [None]:
# Dist Plot(Numerical Data)
# displot's first positional parameter is `data`; name it explicitly.
sns.displot(data=df['Age'])

In [None]:
# The previous cell already draws the default (histogram) displot of Age.
# Show the kernel-density view here so this cell adds information instead of
# repeating the same figure.
sns.displot(data=df['Age'], kind='kde')

In [None]:
# Box Plot(Numerical Data)
# Pass the column as `x=`: in current seaborn the first positional argument is
# `data`, so a bare Series is interpreted as wide-form input rather than as
# the variable to place on the x axis.
sns.boxplot(x=df['Age'])

In [None]:
# Smallest recorded Age.
df.loc[:, 'Age'].min()

In [None]:
# Largest recorded Age.
df.loc[:, 'Age'].max()

In [None]:
# Average Age.
df.loc[:, 'Age'].mean()

In [None]:
# Skewness of the Age distribution.
df.loc[:, 'Age'].skew()

-> Bivariate Analysis

In [None]:
# Keep a dedicated handle on the Titanic frame: `df` is rebound to other
# datasets further down the notebook.
titanic = df
# seaborn's bundled example datasets use lowercase names; 'Iris' is rejected
# by load_dataset, so the dataset is requested as 'iris'.
tips = sns.load_dataset('tips')
flights = sns.load_dataset('flights')
iris = sns.load_dataset('iris')

In [None]:
# Scatterplot(Numerical-Numerical)
# Bill vs tip, colour-coded by sex and marker-coded by smoker status; every
# variable is referenced by column name against `data=tips`.
sns.scatterplot(data=tips, x='total_bill', y='tip', hue='sex', style='smoker')

In [None]:
# Bar Plot(Numerical-Categorical)
# Fare per passenger class, split by sex.
sns.barplot(data=titanic, x='Pclass', y='Fare', hue='Sex')

In [None]:
# Box Plot(Numerical-Categorical)
# Age distribution per sex, split by survival flag.
sns.boxplot(data=titanic, x='Sex', y='Age', hue='Survived')

In [None]:
# Dist Plot(Numerical-Categorical)
# Age is numerical and Survived is categorical, so this is a
# numerical-categorical comparison. `sns.distplot` is deprecated; histplot
# with a KDE overlay is the replacement. `common_norm=False` normalises each
# survival group on its own, matching what two separate distplot calls did.
sns.histplot(
    data=titanic,
    x='Age',
    hue='Survived',
    stat='density',
    common_norm=False,
    kde=True,
)

In [None]:
# HeatMap(Categorical-Categorical)
# Contingency table of passenger class (rows) against survival (columns).
class_by_survival = pd.crosstab(index=titanic['Pclass'], columns=titanic['Survived'])
sns.heatmap(class_by_survival)

In [None]:
# Survival percentage per passenger class. Select 'Survived' before
# aggregating: averaging every column first fails on pandas >= 2.0 as soon as
# the frame holds non-numeric columns, and computes means that are never used.
titanic.groupby('Pclass')['Survived'].mean() * 100

In [None]:
# Clustered heatmap of siblings/spouses count against survival.
# NOTE(review): every other Titanic column in this notebook is capitalised
# ('Pclass', 'Survived', 'Sex', 'Age', 'Fare'), so the lowercase 'sibsp' key
# would raise KeyError; 'SibSp' is assumed to be the CSV header — confirm.
sns.clustermap(pd.crosstab(titanic['SibSp'], titanic['Survived']))

In [None]:
# Pairplot(Multiple Numerical)
# Pairwise relationships between the iris columns, coloured by species.
sns.pairplot(data=iris, hue='species')

In [None]:
# Lineplot(Multiple Numerical)
# Total passengers per year. Only 'passengers' is summed: the 'month' column
# of seaborn's flights dataset is categorical and cannot be summed, which
# makes a frame-wide groupby(...).sum() raise on pandas >= 2.0.
new = flights.groupby('year', as_index=False)['passengers'].sum()
sns.lineplot(x=new['year'], y=new['passengers'])

In [None]:
# Month x year grid of passenger counts, then a clustered heatmap of it.
passengers_grid = flights.pivot_table(index='month', columns='year', values='passengers')
sns.clustermap(passengers_grid)

-> Pandas Profiling

In [None]:
# Optional automated profiling report, left disabled: every statement below is
# commented out, so this cell does nothing when run.
# NOTE(review): the pandas_profiling package is now distributed as
# ydata-profiling — confirm the import name before re-enabling.
#from pandas_profiling import ProfileReport
#prof = ProfileReport(df)
#prof.to_file(output_file='output.html')

-------------------- Feature Engineering --------------------

--> Feature Transformation <--

--:Feature Scaling

1.Standardization

In [None]:
# Rebind `df` to the Social_Network_Ads.csv data used for the standardization demo.
df = pd.read_csv(filepath_or_buffer='Social_Network_Ads.csv')

In [None]:
# Keep every row but drop the first two columns.
df2 = df.iloc[:, slice(2, None)]

In [None]:
# Render the loaded frame (the last expression of a cell is displayed).
df

In [None]:
# Split the column-sliced frame `df2` (df.iloc[:, 2:]) rather than the raw
# `df`: the raw frame still carries its first two columns, which would flow
# into StandardScaler in the next cell.
# NOTE(review): assumes those two leading columns are an identifier and a
# non-numeric field that the scaler cannot handle — confirm against the CSV.
x_train, x_test, y_train, y_test = train_test_split(
    df2.drop('Purchased', axis=1),
    df2['Purchased'],
    test_size=0.3,
    random_state=0,
)
# Report the split sizes instead of echoing both full frames.
x_train.shape, x_test.shape

In [None]:
# Fit the scaler on the training split only, then apply the same fitted
# transform to both splits.
scaler = StandardScaler().fit(x_train)
x_train_scaled, x_test_scaled = (scaler.transform(part) for part in (x_train, x_test))

In [None]:
# Per-feature means stored on the scaler by the fit call above.
scaler.mean_


In [None]:
# Wrap the transformed values back into DataFrames, reusing the column labels
# of the corresponding unscaled split.
x_train_scaled, x_test_scaled = (
    pd.DataFrame(values, columns=source.columns)
    for values, source in ((x_train_scaled, x_train), (x_test_scaled, x_test))
)
x_train_scaled

In [None]:
# Summary statistics of the unscaled training features, to one decimal place.
x_train.describe().round(1)

In [None]:
# Summary statistics of the scaled training features, to one decimal place.
x_train_scaled.describe().round(1)

In [None]:
# Before/after comparison must use the same rows: the left panel shows the
# training split, so the right panel shows the *scaled training* split
# (plotting x_test_scaled would compare two different samples).
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12, 5))
ax1.scatter(x_train['Age'], x_train['EstimatedSalary'])
ax1.set_title("Before Scaling")
ax2.scatter(x_train_scaled['Age'], x_train_scaled['EstimatedSalary'])
ax2.set_title("After Scaling")
plt.show()

In [None]:
# Density of both features before (left) and after (right) standardization.
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12, 5))
ax1.set_title("Before Scaling")
sns.kdeplot(x_train['Age'], ax=ax1)
sns.kdeplot(x_train['EstimatedSalary'], ax=ax1)
ax2.set_title("After Scaling")
sns.kdeplot(x_train_scaled['Age'], ax=ax2)
sns.kdeplot(x_train_scaled['EstimatedSalary'], ax=ax2)
# `plt.show` without parentheses only references the function; call it.
plt.show()

2. Normalization

In [None]:
# The file is read without a header row; keep its first three columns and
# label them explicitly.
wine_columns = ['Class label', 'Alcohol', 'Malic acid']
df = pd.read_csv('wine_data.csv', header=None, usecols=[0, 1, 2])
df = df.set_axis(wine_columns, axis='columns')

In [None]:
# Render the three-column wine frame built in the previous cell.
df

In [None]:
# Density estimate of the 'Alcohol' column.
alcohol = df.loc[:, 'Alcohol']
sns.kdeplot(alcohol)

In [None]:
# Density estimate of the 'Malic acid' column.
malic_acid = df.loc[:, 'Malic acid']
sns.kdeplot(malic_acid)

In [None]:
# Fixed colour per class label so the three classes are easy to tell apart.
color_dict = dict([(1, 'red'), (3, 'green'), (2, 'blue')])
sns.scatterplot(
    x='Alcohol',
    y='Malic acid',
    hue='Class label',
    palette=color_dict,
    data=df,
)

In [None]:
# train_test_split is already imported in the notebook's first cell, so the
# repeated import is dropped. 70/30 split with a fixed seed.
wine_features = df.drop('Class label', axis=1)
wine_target = df['Class label']
X_train, X_test, y_train, y_test = train_test_split(
    wine_features,
    wine_target,
    test_size=0.3,
    random_state=0,
)

X_train.shape, X_test.shape

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Fit on the training split only, then apply the same fitted transform to the
# train and test sets.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled, X_test_scaled = (scaler.transform(part) for part in (X_train, X_test))

In [None]:
# Wrap the transformed values back into DataFrames, reusing the column labels
# of the corresponding unscaled split.
X_train_scaled, X_test_scaled = (
    pd.DataFrame(values, columns=source.columns)
    for values, source in ((X_train_scaled, X_train), (X_test_scaled, X_test))
)

In [None]:
# Summary statistics of the unscaled training features, to one decimal place.
X_train.describe().round(1)

In [None]:
# Summary statistics of the scaled training features, to one decimal place.
X_train_scaled.describe().round(1)

In [None]:
# Same scatter drawn twice: raw training features on the left, scaled
# training features on the right.
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12, 5))

panels = (
    (ax1, "Before Scaling", X_train),
    (ax2, "After Scaling", X_train_scaled),
)
for axis, title, frame in panels:
    axis.scatter(frame['Alcohol'], frame['Malic acid'])
    axis.set_title(title)
plt.show()

In [None]:
# Density of both features before (left) and after (right) scaling.
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12, 5))

# before scaling
ax1.set_title('Before Scaling')
sns.kdeplot(X_train['Alcohol'], ax=ax1)
sns.kdeplot(X_train['Malic acid'], ax=ax1)

# after scaling — this section uses MinMaxScaler, so the panel must not be
# labelled as standard scaling
ax2.set_title('After Min-Max Scaling')
sns.kdeplot(X_train_scaled['Alcohol'], ax=ax2)
sns.kdeplot(X_train_scaled['Malic acid'], ax=ax2)
plt.show()

--: Encoding Categorical Data

1. Ordinal Encoding (Ordinal Data)

In [None]:
# Rebind `df` to the customer.csv data used for the ordinal-encoding demo.
df = pd.read_csv(filepath_or_buffer='customer.csv')

In [None]:
# Peek at five randomly chosen rows.
df.sample(n=5)

In [None]:
# Drop the first two columns, keeping every row. Re-running this cell slices
# again, so run it once per load.
df = df.iloc[:, slice(2, None)]

In [None]:
# First five rows of the remaining columns.
df.head(n=5)

In [None]:
# First two columns are the features, last column is the target.
# A fixed random_state makes the 80/20 split, and every output derived from
# it, reproducible across kernel restarts. train_test_split is already
# imported in the notebook's first cell, so the repeated import is dropped.
x_train, x_test, y_train, y_test = train_test_split(
    df.iloc[:, 0:2],
    df.iloc[:, -1],
    test_size=0.2,
    random_state=0,
)

In [None]:
# First ten training rows before encoding.
x_train[:10]

In [None]:
from sklearn.preprocessing import OrdinalEncoder

In [None]:
# One ordered category list per feature column, in column order; the position
# of a value in its list becomes its encoded integer.
first_feature_order = ['Poor', 'Average', 'Good']
second_feature_order = ['School', 'UG', 'PG']
oe = OrdinalEncoder(categories=[first_feature_order, second_feature_order])

In [None]:
# Fit the encoder on the training features only.
oe.fit(X=x_train)

In [None]:
# Encode both splits with the encoder fitted on the training data. The
# original cell left x_test unencoded, unlike the target, where both y_train
# and y_test are transformed further down.
x_train = oe.transform(x_train)
x_test = oe.transform(x_test)

In [None]:
# First ten training rows after ordinal encoding.
x_train[:10]

In [None]:
# Category orderings held by the fitted ordinal encoder.
oe.categories_

In [None]:
from sklearn.preprocessing import LabelEncoder
# Encoder for the target column; it is fitted on y_train in the next cell.
le = LabelEncoder()

In [None]:
# Fit the label encoder on the training target.
le.fit(y=y_train)

In [None]:
# Distinct target labels stored on the encoder by the fit call above.
le.classes_

In [None]:
# Apply the fitted label encoder to the training and test targets.
y_train, y_test = (le.transform(target) for target in (y_train, y_test))

In [None]:
# Encoded training target produced by le.transform above.
y_train

2. One-Hot Encoding

In [None]:
# Rebind `df` to the cars.csv data used for the one-hot encoding demo and show
# its first five rows.
df = pd.read_csv(filepath_or_buffer='cars.csv')
df.head(n=5)

In [None]:
# Frequency of each distinct value in the 'owner' column.
df.loc[:, 'owner'].value_counts()

In [None]:
# One-Hot Encoding using Pandas
pd.get_dummies(df,columns=['fuel','owner'])

In [None]:
# K-1 One-Hot Encoding
pd.get_dummies(df,columns=['fuel','owner'],drop_first=True)

In [None]:
# One-Hot Encoding using sklearn
# train_test_split is already imported in the notebook's first cell, so the
# repeated import is dropped; OneHotEncoder is not, so its import stays here.

X_train, X_test, y_train, y_test = train_test_split(
    df.iloc[:, 0:4],
    df.iloc[:, -1],
    test_size=0.2,
    random_state=2,
)

# Training preview followed by one blank line.
print(X_train.head(), end="\n\n")

from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(drop='first', sparse_output=False, dtype=np.int32)

# Fit on the training split only; reuse the fitted encoder for the test split.
categorical_cols = ['fuel', 'owner']
X_train_new = ohe.fit_transform(X_train[categorical_cols])
X_test_new = ohe.transform(X_test[categorical_cols])

print("x_train_new shape: ", X_train_new.shape)

# Stitch the untouched columns back next to the encoded indicator columns.
np.hstack((X_train[['brand', 'km_driven']].values, X_train_new))

In [None]:
# One-Hot Encoding with Top-Categories
# Brands that occur `threshold` times or fewer are pooled into a single
# 'uncommon' bucket before encoding. The stray `df['brand'].nunique()` line
# was removed: its value was discarded because it was not the last expression
# of the cell.

threshold = 100
counts = df['brand'].value_counts()
repl = counts[counts <= threshold].index
brand_grouped = df['brand'].mask(df['brand'].isin(repl), 'uncommon')
pd.get_dummies(brand_grouped).sample(5)

3. Column Transformer

In [94]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

In [96]:
# Load covid_toy.csv, then print a preview, a blank line and the per-column
# missing-value counts.
df = pd.read_csv(filepath_or_buffer='covid_toy.csv')
missing_per_column = df.isnull().sum()
print(df.head(), end="\n\n")
print(missing_per_column)

   age  gender  fever cough     city has_covid
0   60    Male  103.0  Mild  Kolkata        No
1   27    Male  100.0  Mild    Delhi       Yes
2   42    Male  101.0  Mild    Delhi        No
3   31  Female   98.0  Mild  Kolkata        No
4   65  Female  101.0  Mild   Mumbai        No

age           0
gender        0
fever        10
cough         0
city          0
has_covid     0
dtype: int64


In [98]:
# Every column except 'has_covid' is a feature; 'has_covid' is the target.
# A fixed random_state makes the 80/20 split, and the preview below,
# reproducible across kernel restarts. train_test_split is already imported in
# the notebook's first cell, so the repeated import is dropped.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['has_covid']),
    df['has_covid'],
    test_size=0.2,
    random_state=0,
)
X_train

Unnamed: 0,age,gender,fever,cough,city
71,75,Female,104.0,Strong,Delhi
74,34,Female,104.0,Strong,Delhi
35,82,Female,102.0,Strong,Bangalore
81,65,Male,99.0,Mild,Delhi
50,19,Male,101.0,Mild,Delhi
...,...,...,...,...,...
17,40,Female,98.0,Strong,Delhi
51,11,Female,100.0,Strong,Kolkata
77,8,Female,101.0,Mild,Kolkata
61,81,Female,98.0,Strong,Mumbai


In [100]:
from sklearn.compose import ColumnTransformer

# One (name, transformer, columns) triple per column group; columns not listed
# in any triple are passed through unchanged.
column_steps = [
    ('tnf1', SimpleImputer(), ['fever']),
    ('tnf2', OrdinalEncoder(categories=[['Mild', 'Strong']]), ['cough']),
    ('tnf3', OneHotEncoder(sparse_output=False, drop='first'), ['gender', 'city']),
]
transformer = ColumnTransformer(transformers=column_steps, remainder='passthrough')

# Fit on the training split, then reuse the fitted transformer for the test split.
train_matrix = transformer.fit_transform(X_train)
print("x_train: ", train_matrix.shape)

test_matrix = transformer.transform(X_test)
print("x_test", test_matrix.shape)

x_train:  (80, 7)
x_test (20, 7)
