Data Analysis and Profiling using Python and Pandas

In [1]:
# import modules
import pandas as pd

In [2]:
# load the raw online-foods CSV into a dataframe
raw_csv_path = '..//raw_data//onlinefoods.csv'
df = pd.read_csv(raw_csv_path)

In [3]:
# check the shape of the df: a (number of rows, number of columns) tuple
df.shape

(388, 13)

In [4]:
# check the available information of the df:
# index range, per-column non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 388 entries, 0 to 387
Data columns (total 13 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Age                         388 non-null    int64  
 1   Gender                      388 non-null    object 
 2   Marital Status              388 non-null    object 
 3   Occupation                  388 non-null    object 
 4   Monthly Income              388 non-null    object 
 5   Educational Qualifications  388 non-null    object 
 6   Family size                 388 non-null    int64  
 7   latitude                    388 non-null    float64
 8   longitude                   388 non-null    float64
 9   Pin code                    388 non-null    int64  
 10  Output                      388 non-null    object 
 11  Feedback                    388 non-null    object 
 12  Unnamed: 12                 388 non-null    object 
dtypes: float64(2), int64(3), object(8)


In [5]:
# check sample data of the dataframe
# (the rendered output is truncated to the first and last rows, with an ellipsis row between)
display(df)

Unnamed: 0,Age,Gender,Marital Status,Occupation,Monthly Income,Educational Qualifications,Family size,latitude,longitude,Pin code,Output,Feedback,Unnamed: 12
0,20,Female,Single,Student,No Income,Post Graduate,4,12.9766,77.5993,560001,Yes,Positive,Yes
1,24,Female,Single,Student,Below Rs.10000,Graduate,3,12.9770,77.5773,560009,Yes,Positive,Yes
2,22,Male,Single,Student,Below Rs.10000,Post Graduate,3,12.9551,77.6593,560017,Yes,Negative,Yes
3,22,Female,Single,Student,No Income,Graduate,6,12.9473,77.5616,560019,Yes,Positive,Yes
4,22,Male,Single,Student,Below Rs.10000,Post Graduate,4,12.9850,77.5533,560010,Yes,Positive,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...
383,23,Female,Single,Student,No Income,Post Graduate,2,12.9766,77.5993,560001,Yes,Positive,Yes
384,23,Female,Single,Student,No Income,Post Graduate,4,12.9854,77.7081,560048,Yes,Positive,Yes
385,22,Female,Single,Student,No Income,Post Graduate,5,12.9850,77.5533,560010,Yes,Positive,Yes
386,23,Male,Single,Student,Below Rs.10000,Post Graduate,2,12.9770,77.5773,560009,Yes,Positive,Yes


In [6]:
# count missing values per column (all zeros means there is nothing to impute)
df.isna().sum()

Age                           0
Gender                        0
Marital Status                0
Occupation                    0
Monthly Income                0
Educational Qualifications    0
Family size                   0
latitude                      0
longitude                     0
Pin code                      0
Output                        0
Feedback                      0
Unnamed: 12                   0
dtype: int64

In [7]:
# rename columns to snake_case
# `rename` returns a new frame, so reassign the result instead of mutating with inplace=True;
# the mapping lives inside braces, so no backslash line continuations are needed.
# 'latitude', 'longitude' and 'Unnamed: 12' are not in the mapping and keep their names.
column_names = {
    'Age': 'age',
    'Gender': 'gender',
    'Marital Status': 'marital_status',
    'Occupation': 'occupation',
    'Monthly Income': 'monthly_income',
    'Educational Qualifications': 'educational_qualifications',
    'Family size': 'family_size',
    'Pin code': 'pin_code',
    'Output': 'output',
    'Feedback': 'feedback',
}
df = df.rename(columns=column_names)

In [8]:
# spot-check five random rows to confirm the new column names took effect
df.sample(n=5)

Unnamed: 0,age,gender,marital_status,occupation,monthly_income,educational_qualifications,family_size,latitude,longitude,pin_code,output,feedback,Unnamed: 12
185,28,Male,Married,Employee,More than 50000,Post Graduate,1,12.9925,77.5633,560021,Yes,Positive,Yes
229,32,Male,Married,Employee,25001 to 50000,Graduate,3,12.9706,77.6529,560075,Yes,Positive,Yes
352,29,Female,Married,Employee,25001 to 50000,Graduate,4,12.9783,77.6408,560038,No,Positive,No
286,25,Female,Single,Student,10001 to 25000,Graduate,2,12.9757,77.5586,560023,Yes,Positive,Yes
118,32,Female,Married,Employee,25001 to 50000,Graduate,5,12.9261,77.6221,560034,Yes,Positive,Yes


In [9]:
# drop unnecessary columns
# reassigning the result (instead of inplace=True) together with errors='ignore' keeps
# this cell safe to re-run: with the in-place version a second execution raised KeyError
# because the columns had already been removed.
columns_to_drop = ['Unnamed: 12', 'monthly_income', 'latitude', 'longitude', 'pin_code']
df = df.drop(columns=columns_to_drop, errors='ignore')

In [10]:
# spot-check five random rows of the remaining columns
df.sample(n=5)

Unnamed: 0,age,gender,marital_status,occupation,educational_qualifications,family_size,output,feedback
100,24,Female,Single,Student,Post Graduate,3,No,Positive
77,21,Male,Single,Student,Graduate,4,Yes,Positive
249,23,Female,Single,Employee,Post Graduate,2,Yes,Positive
34,22,Female,Single,Student,Post Graduate,2,Yes,Positive
339,21,Male,Single,Student,Graduate,2,No,Negative


In [36]:
# distinct values present in the gender column
df['gender'].unique()

array(['Female', 'Male'], dtype=object)

In [37]:
# distinct values present in the marital_status column
df['marital_status'].unique()

array(['Single', 'Married', 'Prefer not to say'], dtype=object)

In [11]:
# distinct values present in the occupation column
df['occupation'].unique()

array(['Student', 'Employee', 'Self Employeed', 'House wife'],
      dtype=object)

In [12]:
# distinct values present in the educational_qualifications column
df['educational_qualifications'].unique()

array(['Post Graduate', 'Graduate', 'Ph.D', 'Uneducated', 'School'],
      dtype=object)

In [13]:
# distinct values present in the output column
df['output'].unique()

array(['Yes', 'No'], dtype=object)

In [14]:
# distinct values present in the feedback column
# NOTE: the raw label 'Negative ' carries a trailing space
df['feedback'].unique()

array(['Positive', 'Negative '], dtype=object)

In [18]:
# export cleaned data to transformed folder
# the raw 'feedback' column contains 'Negative ' with a trailing space, so strip the
# surrounding whitespace on the way out; `assign` builds a new frame for the export,
# leaving `df` itself untouched and the cell safe to re-run.
cleaned_csv_path = '..//transformed_data//cleaned_onlinefoods.csv'
df.assign(feedback=df['feedback'].str.strip()).to_csv(cleaned_csv_path, index=False)