In [1]:
import pandas as pd
import missingno as msno
import numpy  as np
import matplotlib.pyplot as plt
import seaborn as sns

 # 1.   taking info about dataset

In [2]:
df = pd.read_csv('Sales.csv')
df.info()                                      


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Customer_Segment              8034 non-null   object 
 1   Sales_Before                  8478 non-null   float64
 2   Sales_After                   9233 non-null   float64
 3   Customer_Satisfaction_Before  8330 non-null   float64
 4   Customer_Satisfaction_After   8360 non-null   float64
 5   Purchase_Made                 9195 non-null   object 
dtypes: float64(4), object(2)
memory usage: 468.9+ KB


 ## 2.  shows first 10 rows

In [3]:
df.head(10)                                    

Unnamed: 0,Customer_Segment,Sales_Before,Sales_After,Customer_Satisfaction_Before,Customer_Satisfaction_After,Purchase_Made
0,High Value,240.548359,300.007567,74.684767,,No
1,High Value,246.862114,381.337555,100.0,100.0,Yes
2,High Value,156.978084,179.330464,98.780735,100.0,No
3,Medium Value,192.126708,229.278031,49.333766,39.811841,Yes
4,High Value,229.685622,,83.974852,87.738591,Yes
5,,135.573003,218.559988,58.075342,69.404918,No
6,High Value,191.713918,222.409356,89.967827,85.120975,Yes
7,Low Value,173.752555,213.168232,66.984711,67.881558,
8,High Value,208.308577,248.17883,95.36667,84.790294,Yes
9,High Value,235.071493,352.756872,72.919851,70.753225,No


   # 3. detecting missing values

In [4]:
df.isnull().sum()                            

Customer_Segment                1966
Sales_Before                    1522
Sales_After                      767
Customer_Satisfaction_Before    1670
Customer_Satisfaction_After     1640
Purchase_Made                    805
dtype: int64

# Dropping missing value

In [5]:
dropped_data = df.dropna()    # ! dropping not good choice here


In [6]:
dropped_data.isnull().sum() 

Customer_Segment                0
Sales_Before                    0
Sales_After                     0
Customer_Satisfaction_Before    0
Customer_Satisfaction_After     0
Purchase_Made                   0
dtype: int64

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Customer_Segment              8034 non-null   object 
 1   Sales_Before                  8478 non-null   float64
 2   Sales_After                   9233 non-null   float64
 3   Customer_Satisfaction_Before  8330 non-null   float64
 4   Customer_Satisfaction_After   8360 non-null   float64
 5   Purchase_Made                 9195 non-null   object 
dtypes: float64(4), object(2)
memory usage: 468.9+ KB


# Fill by iteration mean mode median


In [8]:
missing_value_list = df.columns[df.isnull().any()]
filled_by_iteration = df.copy()
for i  in missing_value_list:
  if filled_by_iteration[i].dtypes == 'object':
    filled_by_iteration[i].fillna(filled_by_iteration[i].mode()[0], inplace=True)   # ! fills object types
  else:
    filled_by_iteration[i].fillna(filled_by_iteration[i].mean(), inplace=True)        # ! fills number types with mean
    #filled_by_iteration[i].fillna(filled_by_iteration[i].median(), inplace=True)        # ! fills number types with median

filled_by_iteration.isnull().sum()
# filled_by_mean['Purchase_Made'].dtypes

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  filled_by_iteration[i].fillna(filled_by_iteration[i].mode()[0], inplace=True)   # ! fills object types
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  filled_by_iteration[i].fillna(filled_by_iteration[i].mean(), inplace=True)        # ! fills number types with mean


Customer_Segment                0
Sales_Before                    0
Sales_After                     0
Customer_Satisfaction_Before    0
Customer_Satisfaction_After     0
Purchase_Made                   0
dtype: int64

# Encoding filled_by_iteration dataset

# One hot encoding

In [9]:
filled_by_iteration.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Customer_Segment              10000 non-null  object 
 1   Sales_Before                  10000 non-null  float64
 2   Sales_After                   10000 non-null  float64
 3   Customer_Satisfaction_Before  10000 non-null  float64
 4   Customer_Satisfaction_After   10000 non-null  float64
 5   Purchase_Made                 10000 non-null  object 
dtypes: float64(4), object(2)
memory usage: 468.9+ KB


In [10]:
filled_by_iteration['Customer_Segment'].value_counts()    # ! getting value frequency of column

Customer_Segment
Low Value       4665
Medium Value    2697
High Value      2638
Name: count, dtype: int64

In [11]:
filled_by_iteration['Purchase_Made'].value_counts()        # ! getting value frequency of column

Purchase_Made
Yes    5472
No     4528
Name: count, dtype: int64

In [12]:
one_hot_encoded = pd.get_dummies(filled_by_iteration,columns=None, dtype=int)  # ! hamma object type categorical value ga binary(0,1) qiymat beradi.

In [13]:
one_hot_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 9 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Sales_Before                   10000 non-null  float64
 1   Sales_After                    10000 non-null  float64
 2   Customer_Satisfaction_Before   10000 non-null  float64
 3   Customer_Satisfaction_After    10000 non-null  float64
 4   Customer_Segment_High Value    10000 non-null  int64  
 5   Customer_Segment_Low Value     10000 non-null  int64  
 6   Customer_Segment_Medium Value  10000 non-null  int64  
 7   Purchase_Made_No               10000 non-null  int64  
 8   Purchase_Made_Yes              10000 non-null  int64  
dtypes: float64(4), int64(5)
memory usage: 703.3 KB


# Label encoding

In [14]:
from sklearn.preprocessing import LabelEncoder

In [15]:
label_encoded = filled_by_iteration.copy()
columns_list = df.columns.to_list()                 # ! main dataset ni columnlarini list ga ovoldik
for i in  columns_list:
  if label_encoded[i].dtypes == 'object':
    label_encoded[i]= LabelEncoder().fit_transform(label_encoded[i])    # ! categorical value ga unique qiymat beradi 0,1,2,3

label_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Customer_Segment              10000 non-null  int64  
 1   Sales_Before                  10000 non-null  float64
 2   Sales_After                   10000 non-null  float64
 3   Customer_Satisfaction_Before  10000 non-null  float64
 4   Customer_Satisfaction_After   10000 non-null  float64
 5   Purchase_Made                 10000 non-null  int64  
dtypes: float64(4), int64(2)
memory usage: 468.9 KB


In [16]:
label_encoded.head(20)

Unnamed: 0,Customer_Segment,Sales_Before,Sales_After,Customer_Satisfaction_Before,Customer_Satisfaction_After,Purchase_Made
0,0,240.548359,300.007567,74.684767,73.872593,0
1,0,246.862114,381.337555,100.0,100.0,1
2,0,156.978084,179.330464,98.780735,100.0,0
3,2,192.126708,229.278031,49.333766,39.811841,1
4,0,229.685622,280.457952,83.974852,87.738591,1
5,1,135.573003,218.559988,58.075342,69.404918,0
6,0,191.713918,222.409356,89.967827,85.120975,1
7,1,173.752555,213.168232,66.984711,67.881558,1
8,0,208.308577,248.17883,95.36667,84.790294,1
9,0,235.071493,352.756872,72.919851,70.753225,0


#  Target encoding

In [25]:
target_encoded = filled_by_iteration.copy()
target_encoded = pd.get_dummies(target_encoded,columns=['Purchase_Made'], dtype=int)  # ! target variable ham categorical bulganligi uchun aval uni one hot bilan encode qildik

# target_encoded['Customer_Segment_target'] = target_encoded.groupby('Customer_Segment')['Purchase_Made'].transform('mean')

target_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 7 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Customer_Segment              10000 non-null  object 
 1   Sales_Before                  10000 non-null  float64
 2   Sales_After                   10000 non-null  float64
 3   Customer_Satisfaction_Before  10000 non-null  float64
 4   Customer_Satisfaction_After   10000 non-null  float64
 5   Purchase_Made_No              10000 non-null  int64  
 6   Purchase_Made_Yes             10000 non-null  int64  
dtypes: float64(4), int64(2), object(1)
memory usage: 547.0+ KB


In [26]:
'yes' if 5>4  else 'no'

'yes'

In [28]:
for i in  range(1,11):
  if i%2 ==0:
    print(i)

  

2
4
6
8
10
