# Data Prep Assignments

## Assignment 1: Set the Correct Row Granularity

In [2]:
import pandas as pd
import numpy as np

In [13]:
# 1. Read the Excel spreadsheet into a Pandas DataFrame
# The path is relative, so it resolves against the notebook's working directory.
entertainment = pd.read_excel('../Data/entertainment.xlsx')
# Preview the first five rows to confirm the file loaded as expected
entertainment.head()

Unnamed: 0,name,entertainment,hours_per_week
0,Emily,video_games,5.1
1,Liam,video_games,4.9
2,Olivia,video_games,4.7
3,Noah,video_games,4.6
4,Ava,video_games,5.0


In [4]:
# 2. Check the number of rows and columns
# info() also lists the non-null count per column, which exposes missing values
entertainment.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   name            600 non-null    object 
 1   entertainment   600 non-null    object 
 2   hours_per_week  595 non-null    float64
dtypes: float64(1), object(2)
memory usage: 14.2+ KB


In [5]:
# 3. Determine the row granularity needed
# Count how often each student name and each entertainment type occurs.
# Repeated names mean the raw data holds several rows per student, so it has
# to be reshaped to one row per student before modeling.
for column in ('name', 'entertainment'):
    print(entertainment[column].value_counts())

name
Emily      4
Liam       4
Olivia     4
Noah       4
Ava        4
          ..
Jacob      4
Michael    4
Elijah     4
Daniel     4
Matthew    4
Name: count, Length: 150, dtype: int64
entertainment
video_games    150
tv_shows       150
movies         150
books          150
Name: count, dtype: int64


In [6]:
# 4. Apply the correct DataFrame transformation
# Option A (groupby): collapse to one row per student holding the summed
# weekly hours across all entertainment types
(
    entertainment
    .groupby(by='name')['hours_per_week']
    .sum()
    .reset_index()
)

Unnamed: 0,name,hours_per_week
0,Aaliyah,11.5
1,Abigail,10.7
2,Addison,11.6
3,Adeline,19.0
4,Alana,16.1
...,...,...
145,Winifred,22.1
146,Xanthe,22.9
147,Zara,25.6
148,Zoe,12.8


In [7]:
# Option B (pivot): one row per student and one column per entertainment type.
# Missing hours are replaced with 0, then `name` is moved from the index back
# into a regular column.
(
    entertainment
    .pivot(index='name', columns='entertainment', values='hours_per_week')
    .fillna(0)
    .reset_index()
)


entertainment,name,books,movies,tv_shows,video_games
0,Aaliyah,0.5,1.5,4.6,4.9
1,Abigail,0.0,1.4,4.5,4.8
2,Addison,0.5,1.6,4.5,5.0
3,Adeline,3.5,4.4,4.5,6.6
4,Alana,2.8,3.9,3.8,5.6
...,...,...,...,...,...
145,Winifred,5.2,5.4,4.6,6.9
146,Xanthe,6.0,5.6,4.6,6.7
147,Zara,5.5,6.7,5.7,7.7
148,Zoe,0.0,1.5,6.1,5.2


In [8]:
# 5. Save the transformation as a new DataFrame
# Same pivot as previewed above: one row per student, one column per
# entertainment type, missing hours filled with 0.
entertainment_pivot = (
    entertainment
    .pivot(index='name', columns='entertainment', values='hours_per_week')
    .fillna(0)
    .reset_index()
)


In [9]:
# 6. Check the number of rows and columns
# After the pivot there should be one row per student and no null values,
# because fillna(0) was applied when the pivot was built.
entertainment_pivot.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   name         150 non-null    object 
 1   books        150 non-null    float64
 2   movies       150 non-null    float64
 3   tv_shows     150 non-null    float64
 4   video_games  150 non-null    float64
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [10]:
# (rows, columns) of the pivoted frame, as a quick cross-check of info() above
entertainment_pivot.shape

(150, 5)

## Assignment 2: Prepare Columns for Modeling

In [19]:

# Work on a copy so the columns added below never modify entertainment_pivot.
# NOTE: this rebinds `entertainment` from the raw long-format data to the
# pivoted per-student data, so the Assignment 1 cells cannot be re-run after
# this point without reloading the spreadsheet first.
entertainment = entertainment_pivot.copy()

In [20]:
# 1. Find the missing values
# Returns one boolean per column: True if that column holds at least one NaN.
# Every column is expected to be False here because the pivot in Assignment 1
# already chained .fillna(0).
entertainment.isna().any()

entertainment
name           False
books          False
movies         False
tv_shows       False
video_games    False
dtype: bool

In [None]:
# 2. Fill in the missing values with zeros
# Nothing to run here: the missing values were already replaced with 0 by the
# .fillna(0) chained onto the pivot in Assignment 1, which is why the isna()
# check above reports False for every column.

In [26]:
# 3. Create a new column called video_game_lover for people who played more than 7 hours of video games
# Binary flag: 1 when weekly video game hours are strictly greater than 7, else 0
entertainment['video_game_lover'] = entertainment['video_games'].gt(7).astype(int)
entertainment['video_game_lover']

0      0
1      0
2      0
3      0
4      0
      ..
145    0
146    0
147    1
148    0
149    0
Name: video_game_lover, Length: 150, dtype: int64

In [27]:
# Display the frame to confirm the new video_game_lover column was appended
entertainment

entertainment,name,books,movies,tv_shows,video_games,video_game_lover
0,Aaliyah,0.5,1.5,4.6,4.9,0
1,Abigail,0.0,1.4,4.5,4.8,0
2,Addison,0.5,1.6,4.5,5.0,0
3,Adeline,3.5,4.4,4.5,6.6,0
4,Alana,2.8,3.9,3.8,5.6,0
...,...,...,...,...,...,...
145,Winifred,5.2,5.4,4.6,6.9,0
146,Xanthe,6.0,5.6,4.6,6.7,0
147,Zara,5.5,6.7,5.7,7.7,1
148,Zoe,0.0,1.5,6.1,5.2,0


## Assignment 3: Feature Engineering

In [31]:
# 1. Create a column called total_entertainment that sums up all the types of entertainment for each student
# Row-wise sum over the four hours-per-week columns only (the binary
# video_game_lover flag is deliberately left out of the total)
hour_columns = ['books', 'movies', 'tv_shows', 'video_games']
entertainment['total_entertainment'] = entertainment[hour_columns].sum(axis='columns')

entertainment

entertainment,name,books,movies,tv_shows,video_games,video_game_lover,total_entertainment
0,Aaliyah,0.5,1.5,4.6,4.9,0,11.5
1,Abigail,0.0,1.4,4.5,4.8,0,10.7
2,Addison,0.5,1.6,4.5,5.0,0,11.6
3,Adeline,3.5,4.4,4.5,6.6,0,19.0
4,Alana,2.8,3.9,3.8,5.6,0,16.1
...,...,...,...,...,...,...,...
145,Winifred,5.2,5.4,4.6,6.9,0,22.1
146,Xanthe,6.0,5.6,4.6,6.7,0,22.9
147,Zara,5.5,6.7,5.7,7.7,1,25.6
148,Zoe,0.0,1.5,6.1,5.2,0,12.8


In [33]:
# 2. Create a column called pct_screen that calculates the percent of entertainment that's on screens (everything except for books) for each student
# Stored as a fraction between 0 and 1 (screen hours / total hours), not as 0-100.

screen_hours = entertainment[['movies', 'tv_shows', 'video_games']].sum(axis=1)
entertainment['pct_screen'] = screen_hours.div(entertainment['total_entertainment'])

entertainment['pct_screen']



0      0.956522
1      1.000000
2      0.956897
3      0.815789
4      0.826087
         ...   
145    0.764706
146    0.737991
147    0.785156
148    1.000000
149    0.906977
Name: pct_screen, Length: 150, dtype: float64

## Assignment 4: Feature Selection

In [34]:
# 1. Save the student name column of the DataFrame as its own Series for reference
# This has to run before the 'name' column is dropped in Assignment 5; the
# Series shares the DataFrame's row index, so names can be matched back to rows.
names = entertainment['name']
names

0       Aaliyah
1       Abigail
2       Addison
3       Adeline
4         Alana
         ...   
145    Winifred
146      Xanthe
147        Zara
148         Zoe
149        Zoey
Name: name, Length: 150, dtype: object

In [35]:
# 2. Save the three new columns of the DataFrame as its own DataFrame for modeling – video_game_lover, total_entertainment and pct_screen
# Select the engineered features by name, keeping every row
modeling_columns = ['video_game_lover', 'total_entertainment', 'pct_screen']
entertainment_subset = entertainment.loc[:, modeling_columns]
entertainment_subset

entertainment,video_game_lover,total_entertainment,pct_screen
0,0,11.5,0.956522
1,0,10.7,1.000000
2,0,11.6,0.956897
3,0,19.0,0.815789
4,0,16.1,0.826087
...,...,...,...
145,0,22.1,0.764706
146,0,22.9,0.737991
147,1,25.6,0.785156
148,0,12.8,1.000000


In [42]:
# Alternative: select the same three columns by position instead of by name.
# This only works because the three engineered columns were appended last.

entertainment.iloc[:, -3:] # `:` keeps every row; `-3:` keeps the last three columns (third-from-last through the final column)

entertainment,video_game_lover,total_entertainment,pct_screen
0,0,11.5,0.956522
1,0,10.7,1.000000
2,0,11.6,0.956897
3,0,19.0,0.815789
4,0,16.1,0.826087
...,...,...,...
145,0,22.1,0.764706
146,0,22.9,0.737991
147,1,25.6,0.785156
148,0,12.8,1.000000


## Assignment 5: Feature Scaling

In [45]:
# Drop the identifier column so only numeric columns are passed to the scaler
# (the names were already saved to `names` in Assignment 4).
# errors='ignore' keeps this cell idempotent: because `entertainment` is
# rebound to the result, re-running the cell would otherwise raise KeyError
# once 'name' has been removed.
entertainment = entertainment.drop(columns=['name'], errors='ignore')

In [46]:
# 1. Scale the features in the modeling DataFrame so they all have a mean of 0 and a standard deviation of 1
# NOTE(review): this scales every remaining column of `entertainment` (seven
# columns, including the binary video_game_lover flag), not the three-column
# `entertainment_subset` saved in Assignment 4 -- confirm which frame is the
# intended modeling input.
from sklearn.preprocessing import StandardScaler
std_scaler = StandardScaler()
# Fit the scaler and preview the standardized values; the result is a plain
# NumPy array, so column names are not shown here.
std_scaler.fit_transform(entertainment)

array([[-1.30448273, -1.2833891 ,  0.02175834, ..., -0.29488391,
        -1.3600562 ,  1.38426176],
       [-1.56607686, -1.34022653, -0.13365838, ..., -0.29488391,
        -1.55159093,  1.91823515],
       [-1.30448273, -1.22655167, -0.13365838, ..., -0.29488391,
        -1.33611436,  1.38886498],
       ...,
       [ 1.31145858,  1.6721571 ,  1.73134223, ...,  3.39116499,
         2.01574343, -0.72034429],
       [-1.56607686, -1.2833891 ,  2.3530091 , ..., -0.29488391,
        -1.04881226,  1.91823515],
       [-0.93825095, -1.16971425,  0.48800849, ..., -0.29488391,
        -1.02487042,  0.77578046]])

In [49]:
# 2. Save the output as a final DataFrame that's ready for modeling
# Build the DataFrame in a single step so `entertainment_std` is never bound
# to a bare NumPy array, and carry over both the column names and the row
# index so the scaled rows stay aligned with `entertainment` and `names`.
entertainment_std = pd.DataFrame(
    std_scaler.fit_transform(entertainment),
    columns=entertainment.columns,
    index=entertainment.index,
)
# Sanity check: means should be ~0. describe() reports the sample standard
# deviation (ddof=1) while StandardScaler normalizes with the population
# standard deviation (ddof=0), so `std` shows slightly above 1 rather than
# exactly 1.
entertainment_std.describe()

entertainment,books,movies,tv_shows,video_games,video_game_lover,total_entertainment,pct_screen
count,150.0,150.0,150.0,150.0,150.0,150.0,150.0
mean,7.549517e-17,-2.575717e-16,-5.155136e-16,-4.23365e-16,-5.77316e-17,-1.000681e-15,-7.460699e-16
std,1.00335,1.00335,1.00335,1.00335,1.00335,1.00335,1.00335
min,-1.566077,-1.567576,-2.464909,-1.870024,-0.2948839,-1.743126,-1.574103
25%,-1.147526,-1.226552,-0.5999085,-0.9006812,-0.2948839,-1.018885,-0.7387959
50%,0.1081256,0.3364776,-0.1336584,-0.05250608,-0.2948839,0.0884252,-0.3422683
75%,0.7882703,0.7627583,0.4880085,0.6745011,-0.2948839,0.7109131,1.112733
max,1.67769,1.785832,3.130093,2.492019,3.391165,2.015743,1.918235


In [50]:
# 3. Optional: pickle the dataframe for modeling
# The file is written to the notebook's current working directory and can be
# reloaded with pd.read_pickle; only unpickle files from a trusted source.
entertainment_std.to_pickle('Entertainment_std.pkl')