#### 18.2.2: Pandas Refresher

In [9]:
# Import the Pandas library 
import pandas as pd 

# File to load
file_path = "Resources/iris.csv"
iris_df = pd.read_csv(file_path)
iris_df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [10]:
# Drop the class form dataframe for Unsupervised Learning 
#new_iris_df2 = iris_df.drop(["class"], axis=1)
#new_iris_df2.head()

# Drop the class form dataframe for Unsupervised Learning 
new_iris_df = iris_df.drop(columns="class", axis=1)
new_iris_df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [11]:
# Rearrange the columns so sepal and petal lengths are first
# widths are last two column
# Use reindex()
new_cols = ["sepal_length", "petal_length", "sepal_width", "petal_width"]
new_iris_df = new_iris_df.reindex(columns = new_cols)
new_iris_df.head()

Unnamed: 0,sepal_length,petal_length,sepal_width,petal_width
0,5.1,1.4,3.5,0.2
1,4.9,1.4,3.0,0.2
2,4.7,1.3,3.2,0.2
3,4.6,1.5,3.1,0.2
4,5.0,1.4,3.6,0.2


In [12]:
# convert DataFrame to csv 
output_file_path = "Resources/new_iris_data.csv"
new_iris_df.to_csv(output_file_path, index=False)

#### 18.2.3: Preprocessing Data with Pandas

In [30]:
# Data loading
file_to_load = "Resources/shopping_data.csv"
df_shopping = pd.read_csv(file_to_load)
df_shopping.head()

Unnamed: 0,CustomerID,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,Yes,19.0,15000,39.0
1,2,Yes,21.0,15000,81.0
2,3,No,20.0,16000,6.0
3,4,No,23.0,16000,77.0
4,5,No,31.0,17000,40.0


In [14]:
# use columns method and output the columns 
# Columns
df_shopping.columns

Index(['CustomerID', 'Card Member', 'Age', 'Annual Income',
       'Spending Score (1-100)'],
      dtype='object')

In [15]:
# To confirm the datatypes of the column 
df_shopping.dtypes 

CustomerID                  int64
Card Member                object
Age                       float64
Annual Income               int64
Spending Score (1-100)    float64
dtype: object

In [25]:
df_shopping.isnull().sum()

CustomerID                0
Card Member               2
Age                       2
Annual Income             0
Spending Score (1-100)    1
dtype: int64

In [23]:
# To check any missing data with isnull() method
# Find null values
for column in df_shopping.columns:
    print(f"Column {column} has {df_shopping[column].isnull().sum()} null values")

Column CustomerID has 0 null values
Column Card Member has 2 null values
Column Age has 2 null values
Column Annual Income has 0 null values
Column Spending Score (1-100) has 1 null values


In [26]:
# Drop null rows 
df_shopping = df_shopping.dropna()

In [27]:
# Find duplicate entries
print(f"Duplicate entries: {df_shopping.duplicated().sum()}")

Duplicate entries: 0


In [31]:
# Remove the CustomerID column
df_shopping.drop(columns=["CustomerID"], inplace=True)
df_shopping.head()

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,Yes,19.0,15000,39.0
1,Yes,21.0,15000,81.0
2,No,20.0,16000,6.0
3,No,23.0,16000,77.0
4,No,31.0,17000,40.0


In [33]:
# To convert Card Member string column to Numerical column
# use .apply() method 
# Tranform String column 
def change_string(member):
    if member == "Yes":
        return 1
    else: 
        return 0
    
df_shopping["Card Member"] = df_shopping["Card Member"].apply(change_string)
df_shopping.head()

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,19.0,15000,39.0
1,1,21.0,15000,81.0
2,0,20.0,16000,6.0
3,0,23.0,16000,77.0
4,0,31.0,17000,40.0


In [70]:
# Rescale some column which has larger than the other values in the dataset in columns
# Readjust it by dividing a number to rescale those data points
# Transform Annual Income column 
df_shopping["Annual Income"] = df_shopping["Annual Income"]/1000
df_shopping.head()

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,19.0,0.015,39.0
1,1,21.0,0.015,81.0
2,0,20.0,0.016,6.0
3,0,23.0,0.016,77.0
4,0,31.0,0.017,40.0


In [69]:
# Reformat the names of the columns so they have no sapes or number 
# names_col = df_shopping.columns
# def names(names_col):
#     for i in names_col:
#         new_names = i.replace(" ", "_")
#         print(new_names)

# # df_shopping.columns = df_shopping.columns.

# names(names_col)

Card_Member
Age
Annual_Income
Spending_Score_(1-100)
