In [1]:
#Import Python Libraries
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib as mpl
import seaborn as sns

In [2]:
#Read the tab-separated video store data
df_orig = pd.read_csv("./Video_Store.tsv", sep='\t')
#Work on an independent copy: a plain `df = df_orig` would only alias the same
#object, so later column assignments on df would also alter df_orig
df = df_orig.copy()

In [3]:
#Preview the first five rows to check that the file was parsed as expected
df.head(n=5)

Unnamed: 0,Cust ID,Gender,Income,Age,Rentals,Avg Per Visit,Incidentals,Genre
0,1,M,"$45,000",25,27,2.5,Yes,Action
1,2,F,"$54,000",33,12,3.4,No,Drama
2,3,F,"$32,000",20,42,1.6,No,Comedy
3,4,F,"$59,000",70,16,4.2,Yes,Drama
4,5,M,"$37,000",35,25,3.2,Yes,Action


In [4]:
#List the label of every column (feature) in the DataFrame
df.columns

Index(['Cust ID', 'Gender', 'Income', 'Age', 'Rentals', 'Avg Per Visit',
       'Incidentals', 'Genre'],
      dtype='object')

In [5]:
#Number of features = number of columns, i.e. the second entry of the shape tuple
df.shape[1]

8

1. How many features are there?

There are 8 features.

In [6]:
#Show the dtype pandas inferred for each column when reading the file
df.dtypes

Cust ID            int64
Gender            object
Income            object
Age                int64
Rentals            int64
Avg Per Visit    float64
Incidentals       object
Genre             object
dtype: object

In [7]:
#The head of the table shows one discrepancy: Income is stored as text such as
#"$45,000" and must be converted to a numeric type before it can be analysed.

#Strip the dollar sign and the thousands separator, then convert to float.
# - regex=False makes both patterns literal; '$' is otherwise a regex
#   end-of-string anchor and the default for `regex` differs between pandas versions.
# - astype(str) first keeps the cell safe to re-run: once Income is already float,
#   the .str accessor would otherwise raise.
df['Income'] = (
    df['Income']
    .astype(str)
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .astype(float)
)

In [8]:
#Tally the columns by dtype: object columns hold text, while int64 and float64
#columns are numeric
df.dtypes.value_counts()

int64      3
object     3
float64    2
dtype: int64

2. How many features are continuous, and how many are nominal?

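At first glance, the dtype counts suggest 5 continuous features and 3 nominal features; however, Cust ID is an identifier rather than a measurement, so it is nominal. Therefore, 4 features are continuous (Income, Age, Rentals, Avg Per Visit) and 4 features are nominal (Cust ID, Gender, Incidentals, Genre).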

In [9]:
#Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
#Cust ID appears here only because it is stored as an integer; it is an identifier.
df.describe()

Unnamed: 0,Cust ID,Income,Age,Rentals,Avg Per Visit
count,50.0,50.0,50.0,50.0,50.0
mean,25.5,42300.0,31.56,26.24,2.748
std,14.57738,21409.753642,12.000272,10.027635,0.898125
min,1.0,1000.0,15.0,9.0,1.1
25%,13.25,26750.0,22.0,19.0,2.125
50%,25.5,41000.0,30.0,25.0,2.75
75%,37.75,56750.0,37.5,32.75,3.375
max,50.0,89000.0,70.0,48.0,4.7


3. For the continuous features, what are the average, median, maximum, and minimum values? What is the standard deviation?

Feature       | Average | Median | Maximum | Minimum | Standard Deviation 
--------------|---------|--------|---------|---------|-------------------
Income        | 42,300  | 41,000 | 89,000  |  1,000  | 21,410
Age           | 31.6    |   30   |   70    |   15    | 12
Rentals       | 26.2    |   25   |   48    |    9    | 10
Avg Per Visit | 2.7     |  2.8   |   4.7   |   1.1   | 0.9 

In [10]:
#Discretize Income into three bands: Low = under 25k, Mid = 25k up to (not including) 60k,
#High = 60k and above

# create a list of conditions; the bands are contiguous, so an income of exactly
# 25,000 falls in Mid instead of matching no condition
conditions = [
    (df['Income'] < 25000),
    (df['Income'] >= 25000) & (df['Income'] < 60000),
    (df['Income'] >= 60000),
]

# labels assigned for each condition, in the same order
values = ['Low', 'Mid', 'High']

# create the new column with np.select; rows matching no condition (e.g. a missing
# Income) get the explicit 'Unknown' label, because np.select's own default is the
# integer 0, which does not share a dtype with the string labels
df['Discretized Income'] = np.select(conditions, values, default='Unknown')

# display updated DataFrame
df.head()

Unnamed: 0,Cust ID,Gender,Income,Age,Rentals,Avg Per Visit,Incidentals,Genre,Discretized Income
0,1,M,45000.0,25,27,2.5,Yes,Action,Mid
1,2,F,54000.0,33,12,3.4,No,Drama,Mid
2,3,F,32000.0,20,42,1.6,No,Comedy,Mid
3,4,F,59000.0,70,16,4.2,Yes,Drama,Mid
4,5,M,37000.0,35,25,3.2,Yes,Action,Mid


In [11]:
#Write the current DataFrame to Video_Store2.tsv as tab-separated text,
#leaving out the row index
df.to_csv("./Video_Store2.tsv", sep='\t', index=False)