In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load supershops.csv into a DataFrame; the relative path is resolved against the notebook's working directory.
df = pd.read_csv("supershops.csv")

In [4]:
df.head(5)

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [6]:
df.shape

(50, 5)

# Data Preprocessing

### Null Handling

In [7]:
df.isna().sum()

Marketing Spend    0
Administration     0
Transport          1
Area               0
Profit             0
dtype: int64

In [8]:
df.isnull().sum()

Marketing Spend    0
Administration     0
Transport          1
Area               0
Profit             0
dtype: int64

In [33]:
# Missing-value strategies: drop the affected rows, or impute (mode / median / mean).
# Impute only the "Transport" column with its own mean. A frame-wide
# df.fillna(<Transport mean>) would also write that number into NaNs of any
# other column, including the string column "Area", where it is meaningless.
df["Transport"] = df["Transport"].fillna(df["Transport"].mean())

In [34]:
df.isna().sum()

Marketing Spend    0
Administration     0
Transport          0
Area               0
Profit             0
dtype: int64

In [35]:
# Encoding
#          - Label Encoder
#          - One Hot Encoder
#          - Ordinal Encoder

In [36]:
df.dtypes

Marketing Spend    float64
Administration     float64
Transport          float64
Area                object
Profit             float64
dtype: object

In [48]:
# LabelEncoder
df2 = df.copy()

In [49]:
from sklearn.preprocessing import LabelEncoder

In [50]:
encoder = LabelEncoder()

In [51]:
# LabelEncoder expects a 1-D array of labels, so pass the Series (single brackets).
# A one-column DataFrame only works through an implicit reshape that emits a
# DataConversionWarning, which the global warnings filter at the top would hide.
df2["Area"] = encoder.fit_transform(df2["Area"])

In [52]:
df2.head(5)

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,1,192261.83
1,162597.7,151377.59,443898.53,0,191792.06
2,153441.51,101145.55,407934.54,2,191050.39
3,144372.41,118671.85,383199.62,1,182901.99
4,142107.34,91391.77,366168.42,2,166187.94


In [53]:
# OrdinalEncoder

In [54]:
df3 = df.copy()

In [55]:
from sklearn.preprocessing import OrdinalEncoder

In [56]:
encoder = OrdinalEncoder()

In [57]:
# OrdinalEncoder takes 2-D input (n_samples, n_features), hence the double brackets around "Area".
df3["Area"] = encoder.fit_transform(df3[["Area"]])

In [58]:
df3.head(5)

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,1.0,192261.83
1,162597.7,151377.59,443898.53,0.0,191792.06
2,153441.51,101145.55,407934.54,2.0,191050.39
3,144372.41,118671.85,383199.62,1.0,182901.99
4,142107.34,91391.77,366168.42,2.0,166187.94


In [77]:
# One Hot Encoder

In [78]:
# from sklearn.preprocessing import OneHotEncoder

In [104]:
df4 = df.copy()

In [105]:
df4.head(5)

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [108]:
# One-hot encode only "Area". Naming the column explicitly keeps the single
# "Area" prefix from being applied to any other string column, and building
# from the untouched `df` (df4 is a plain copy of it at this point) keeps the
# cell safe to re-run.
df4 = pd.get_dummies(df, columns=["Area"], prefix="Area", dtype="int")

In [109]:
df4.head(5)

Unnamed: 0,Marketing Spend,Administration,Transport,Profit,Area_Ctg,Area_Dhaka,Area_Rangpur
0,114523.61,136897.8,471784.1,192261.83,0,1,0
1,162597.7,151377.59,443898.53,191792.06,1,0,0
2,153441.51,101145.55,407934.54,191050.39,0,0,1
3,144372.41,118671.85,383199.62,182901.99,0,1,0
4,142107.34,91391.77,366168.42,166187.94,0,0,1


### Duplicate Data

In [112]:
df.isna().sum()

Marketing Spend    0
Administration     0
Transport          0
Area               0
Profit             0
dtype: int64

In [113]:
df.duplicated().sum()

0

In [None]:
#df.drop_duplicates() ##https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.drop_duplicates.html

### Scaling

In [None]:
 #        - MinMaxScaler (Normalization) 
 #        - StandardScaler (Z - score) 
 #        - RobustScaler 
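 #        - Log Transform (FunctionTransformer) / Box-Cox {log_e(value)}: log_e(0) is undefined, so map zeros to 1 first (log_e(1) = 0); Box-Cox also cannot take zeros (it needs strictly positive values), so handle or ignore them beforehand
 #        - MaxAbsScaler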

In [114]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler

In [115]:
min_max_scaler = MinMaxScaler()

In [116]:
standard_scaler = StandardScaler()

In [117]:
robust_scaler = RobustScaler()

In [118]:
df5 = df.copy()

In [119]:
df5.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [120]:
# Min-max normalise "Marketing Spend" on the df5 copy; the double brackets give the scaler the 2-D input it expects.
df5["Marketing Spend"] = min_max_scaler.fit_transform(df5[["Marketing Spend"]])

In [122]:
df5.head(5)

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,0.692617,136897.8,471784.1,Dhaka,192261.83
1,0.983359,151377.59,443898.53,Ctg,191792.06
2,0.927985,101145.55,407934.54,Rangpur,191050.39
3,0.873136,118671.85,383199.62,Dhaka,182901.99
4,0.859438,91391.77,366168.42,Rangpur,166187.94


In [124]:
# Standardise "Administration" to z-scores (subtract the column mean, divide by its standard deviation).
df5["Administration"] = standard_scaler.fit_transform(df5[["Administration"]])

In [125]:
df5.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,0.692617,0.560753,471784.1,Dhaka,192261.83
1,0.983359,1.082807,443898.53,Ctg,191792.06
2,0.927985,-0.728257,407934.54,Rangpur,191050.39
3,0.873136,-0.096365,383199.62,Dhaka,182901.99
4,0.859438,-1.079919,366168.42,Rangpur,166187.94


In [126]:
# Robust-scale "Transport": with default settings RobustScaler centres on the median and divides by the interquartile range.
df5["Transport"] = robust_scaler.fit_transform(df5[["Transport"]])

In [127]:
df5.head(5)

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,0.692617,0.560753,1.561661,Dhaka,192261.83
1,0.983359,1.082807,1.392082,Ctg,191792.06
2,0.927985,-0.728257,1.173378,Rangpur,191050.39
3,0.873136,-0.096365,1.022959,Dhaka,182901.99
4,0.859438,-1.079919,0.919389,Rangpur,166187.94


In [128]:
from sklearn.preprocessing import MaxAbsScaler

In [129]:
max_abs_scaler = MaxAbsScaler()

In [130]:
# Divide "Profit" by its largest absolute value, so the largest-magnitude entry becomes 1.0.
df5["Profit"] = max_abs_scaler.fit_transform(df5[["Profit"]])

In [131]:
df5.head(5)

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,0.692617,0.560753,1.561661,Dhaka,1.0
1,0.983359,1.082807,1.392082,Ctg,0.997557
2,0.927985,-0.728257,1.173378,Rangpur,0.993699
3,0.873136,-0.096365,1.022959,Dhaka,0.951317
4,0.859438,-1.079919,0.919389,Rangpur,0.864383


In [132]:
df.head(5)

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


# Train Test Split

In [134]:
df.shape

(50, 5)

In [136]:
# ML_Model -> Train Data
# ML_Model -> Test Data
# Train 70%  Test 30%

In [137]:
from sklearn.model_selection import train_test_split

In [138]:
df2.head(5)

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,1,192261.83
1,162597.7,151377.59,443898.53,0,191792.06
2,153441.51,101145.55,407934.54,2,191050.39
3,144372.41,118671.85,383199.62,1,182901.99
4,142107.34,91391.77,366168.42,2,166187.94


In [143]:
# Feature matrix: every column of the label-encoded frame except "Area".
X = df2.drop(columns=["Area"])

In [144]:
y = df2["Area"] # Target: the integer-encoded Area label (the class to predict)

In [145]:
y.head(5)

0    1
1    0
2    2
3    1
4    2
Name: Area, dtype: int32

In [146]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)

In [147]:
X_train.head(5)

Unnamed: 0,Marketing Spend,Administration,Transport,Profit
6,134615.46,147198.87,127716.82,156122.51
41,27892.92,84710.77,164470.71,77798.83
46,1315.46,115816.21,297114.46,49490.75
47,0.0,135426.92,0.0,42559.73
15,165349.2,122616.84,261776.23,129917.04


In [149]:
y_train.head(5)

6     0
41    2
46    2
47    0
15    1
Name: Area, dtype: int32

# Model Training

### Logistic Regression

In [150]:
from sklearn.linear_model import LogisticRegression

In [151]:
# Logistic-regression classifier with default hyper-parameters.
# NOTE(review): X is unscaled (values up to ~1e5) and warnings are silenced globally
# at the top of the notebook, so a ConvergenceWarning from the default iteration
# limit would go unnoticed. Consider scaling X or raising max_iter; TODO confirm.
lr = LogisticRegression()

In [152]:
lr.fit(X_train,y_train)

In [154]:
y_test_predicted = lr.predict(X_test)

In [156]:
from sklearn.metrics import accuracy_score

In [157]:
# Fraction of test rows whose predicted Area code matches the true one.
# NOTE(review): Area has three classes (Ctg, Dhaka, Rangpur), so about 0.33 is the
# chance level if they are roughly balanced. A score at or below that suggests these
# features carry little signal for Area; TODO confirm class balance.
accuracy_score(y_test, y_test_predicted)

0.26666666666666666