# Python Data Cleaning

## 1.0 Importing our Libraries

In [128]:
# Importing the Pandas Library
import pandas as pd

# Importing the Numpy Library
import numpy as np



In [129]:
df =pd.read_excel('electronics.xlsx')
df.head(2)

Unnamed: 0,Timestamp,product:,Product Parameters,Shop name,Shop type,Respective price,Ram,Hdd size,Screen size
0,2017-09-27 15:30:44.686000,Flash Disk,,Ycd,retail,6788,,8gb,
1,2017-09-27 15:55:28.424000,Flash disk,,Link electronics,retail,1000,,8gb,


## 1.1 Validity

In [130]:
# Dropping the irrelevant columns i.e. Timestamp, Shop name and Shop type	
# Irrelevant data are those that are not actually needed, and don’t fit under 
# the context of the problem we’re trying to solve.
to_drop = ["Timestamp" , "Shop name" , "Shop type"]
df.drop(to_drop, inplace=True, axis=1)
df.head(2)

Unnamed: 0,product:,Product Parameters,Respective price,Ram,Hdd size,Screen size
0,Flash Disk,,6788,,8gb,
1,Flash disk,,1000,,8gb,


## 1.2 Completeness

### 1.2.1 How to check missing values


In [131]:
#Checking/ Counting Missing Values 
# Checking if there is any missing value across each column
df.isnull().any()


product:              False
Product Parameters     True
Respective price      False
Ram                    True
Hdd size               True
Screen size            True
dtype: bool

In [132]:
# Checking if there is any missing value in dataframe as a whole
df.isnull()

Unnamed: 0,product:,Product Parameters,Respective price,Ram,Hdd size,Screen size
0,False,True,False,True,False,True
1,False,True,False,True,False,True
2,False,True,False,True,False,True
3,False,True,False,True,False,True
4,False,True,False,True,False,True
...,...,...,...,...,...,...
62,False,False,False,False,False,False
63,False,False,False,False,False,False
64,False,False,False,False,False,True
65,False,False,False,False,True,True


In [133]:
# Checking how many missing values there are across each column
df.isnull().sum()

product:               0
Product Parameters    22
Respective price       0
Ram                   33
Hdd size              20
Screen size           46
dtype: int64

In [134]:
# Or we can do a quick check to see if we have any missing values at all
print(df.isnull().values.any())

True


### 1.2.2 How to Handle Missing Values

In [135]:

# Dropping Missing Values 
# If there are only a few null values and you know that deleting values 
# will not cause adverse effects on your result, 
# remove them from your DataFrame and store that in a new DataFrame
# Droppping all 
clean_df = df.dropna()
print(clean_df)

   product:    Product Parameters Respective price  Ram Hdd size  Screen size
34   Laptop                Lenovo            26999    2      500         15.6
35   Laptop            Asus t100t            42500    2       64         10.1
36   Laptop                 Asuss            25500    2       32         11.6
37   Laptop                  Asus            25500    2       32         11.6
38   Laptop   Lenovo ideapad mini            26000    2       32         11.6
39   Laptop                    Hp            35500    4      500         15.6
40   Laptop                    Hp            19000    2      320         15.0
41   Laptop                    Hp            24000    2      500         15.0
42   Laptop               Lg atom            20000    4      320         14.0
43   Laptop           Hp notebook            45000    4      500         15.0
44   Laptop                  Asus            28000    4      500         15.6
45   Laptop         Hp folio 9470            35000    4      500

In [136]:
# Verifying that you no longer have any null values by running 
print(clean_df.isnull().sum())

product:              0
Product Parameters    0
Respective price      0
Ram                   0
Hdd size              0
Screen size           0
dtype: int64


In [137]:
# Dropping all rows that have all NA values
df1 = df.dropna(how="all")
df1.head(58)
df =df1

In [138]:
# We can also put a limitation on how many non-null values need to be in a row 
# we can retain the data that has at least 2 non-null values as shown below
# Uncomment the 2 lines below after running the previous lines
df2 = df.dropna(thresh=2)
print(df2)

      product: Product Parameters Respective price  Ram Hdd size  Screen size
0   Flash Disk                NaN             6788  NaN      8gb          NaN
1   Flash disk                NaN             1000  NaN      8gb          NaN
2        Flash                NaN             1000  NaN      8gb          NaN
3        Flash                NaN             1000  NaN      8GB          NaN
4        Flash                NaN              800  NaN      8gb          NaN
..         ...                ...              ...  ...      ...          ...
62      Laptop  HP elitebook 2540            21000  4GB      500         14.0
63      Laptop       hp i7 laptop            20000    4      320         14.0
64      Laptop               Asus           125000    4      500          NaN
65      Laptop               R450                2    2      NaN          NaN
66      Laptop     Lenovo ideapad              100    2      500          NaN

[67 rows x 6 columns]


In [139]:
# IMPUTING Missing Values
# Imputing the attribute mean for all missing values
# Mean imputation replaces missing values with the mean value of that feature/variable. 


# imputing the mean 
subjects_df_mean['Score'] = subjects_df_mean['Score'].fillna((subjects_df_mean['Score'].mean()))

# printing out our updated dataframe
print(subjects_df_mean) 


NameError: name 'subjects_df_mean' is not defined

## 1.3 Consistency

In [None]:
df.shape

In [None]:
# Example 1: Duplicates
# Duplicates are data points that are repeated in your dataset. 
# These should be simply removed.
df.duplicated().sum()



In [None]:
df.loc[df.duplicated(keep=False), : ]

In [None]:
#Dropping duplicated rows
df.drop_duplicates(keep="first").shape



## 1.4  Uniformity

In [None]:
df.head(4)

In [140]:
df = df.rename(columns={'product:':'Product', 'product_parameters' : 'Product_Parameters', 'respective_price' : 'Respective_Price', 'ram': 'Ram','hdd_size': 'Hdd_Size', 'screen_size' : 'Screen_Size'}) 

In [None]:
#Standardization - Fixing messy column names
# We can rename multiple  data frame column names in the following manner
df = df.rename(columns = {'product:':'Product', 'product_parameters' : 'Product_Parameters', 'respective_price' : 'Respective_Price', 'ram': 'Ram','hdd_size': 'Hdd_Size', 'screen_size' : 'Screen_Size'})


In [None]:
df.head(58)

In [None]:
df[]

In [None]:
#df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_'),str.replace(' : ', ' ')
#df.columns

In [None]:
#for i in  df['Hdd size']:
#print(i)

## 1.5 Accuracy

In [None]:
#In-record & cross-datasets errors 
# drop row 56 since since there was an error during data entry
df = df.drop([df.index[52]])
df.tail(60)
