# Overall

## Libraries

In [1]:
import pandas as  pd
import numpy

## Data 

## 2. Data Preparation Basics

### Intro to Data Prep:


1. Import data

2. Data cleaning involves: 
  -  Removing Duplicates
  - Removing Out-of-Range Records
  - Removing Stray Characters
  - Standardizing Casing

3. Transforming Data
  - Treating Missing Values
  - Scaling and Normalizing Variables
    - It's important to scale data:
      - So that differing magnitudes among variables do not produce erroneous or misleading statistics
      - To prepare your data for machine learning
    - Two ways to Scale:
      - Normalization
        - Putting each observation on a relative scale between the values of 0 and 1
        - Value of Observation / Sum of All Observations in Variable
      - Standardization
        - Rescaling data so that it has a zero mean and unit variance
4. Processing Data
  - Parsing data
  - Recoding data
  - Reformatting data
5. Logging your data
  - Generate Descriptive statistics
  - Logging variable information
    - Log variable information, including:
      - Variable name and statistical description
      - Data format (number, date, text)
      - Method of data collection
      - Date of data collection
      - Data source
      - Location where data is stored
      - Other notes
  - Storing variable information
6. Backing Up your data
  - Create a backup copy
  - Begin analysis of data



### Treating Missing Values

In [2]:
import numpy as np
import pandas as pd

from pandas import DataFrame

In [3]:
# Small roster used by every missing-value example in this section.
data = {
    'names': ['steve', 'john', 'richard', 'sarah', 'randy', 'micheal', 'julie'],
    'age': [20, 22, 20, 21, 24, 23, 22],
    'gender': ['Male', 'Male', 'Male', 'Female', 'Male', 'Male', 'Female'],
    'rank': [2, 1, 4, 5, 3, 7, 6],
}
ranking_df = DataFrame(data)

# Knock out values on purpose. The frame has the default 0..6 index, so the
# inclusive label slices below address the same cells as positional slicing.
ranking_df.loc[2:4, 'age'] = np.nan    # rows 2-4 lose their age
ranking_df.loc[3:5, 'rank'] = np.nan   # rows 3-5 lose their rank
ranking_df.loc[3, :] = np.nan          # row 3 loses every field
ranking_df

Unnamed: 0,names,age,gender,rank
0,steve,20.0,Male,2.0
1,john,22.0,Male,1.0
2,richard,,Male,4.0
3,,,,
4,randy,,Male,
5,micheal,23.0,Male,
6,julie,22.0,Female,6.0


In [8]:
# True marks a missing cell; .notna() / .notnull() return the inverse mask.
ranking_df.isna()

Unnamed: 0,names,age,gender,rank
0,False,False,False,False
1,False,False,False,False
2,False,True,False,False
3,True,True,True,True
4,False,True,False,True
5,False,False,False,True
6,False,False,False,False


In [9]:
# Boolean masking: build a True/False Series flagging rows whose age is
# missing, then use it to keep only those rows.
bool_series = ranking_df['age'].isna()
ranking_df.loc[bool_series]

Unnamed: 0,names,age,gender,rank
2,richard,,Male,4.0
3,,,,
4,randy,,Male,


In [10]:
# Replace every missing cell, in text and numeric columns alike, with the constant 0.
ranking_df.fillna(value=0)

Unnamed: 0,names,age,gender,rank
0,steve,20.0,Male,2.0
1,john,22.0,Male,1.0
2,richard,0.0,Male,4.0
3,0,0.0,0,0.0
4,randy,0.0,Male,0.0
5,micheal,23.0,Male,0.0
6,julie,22.0,Female,6.0


In [11]:
# Forward fill: each missing value is replaced with the last valid value
# above it in the same column.
# DataFrame.ffill() is the supported spelling; fillna(method='pad') is
# deprecated in recent pandas releases and emits a FutureWarning.
ranking_df.ffill()

  ranking_df.fillna(method='pad')


Unnamed: 0,names,age,gender,rank
0,steve,20.0,Male,2.0
1,john,22.0,Male,1.0
2,richard,22.0,Male,4.0
3,richard,22.0,Male,4.0
4,randy,22.0,Male,4.0
5,micheal,23.0,Male,4.0
6,julie,22.0,Female,6.0


In [12]:
# Backward fill: each missing value is replaced with the next valid value
# below it in the same column.
# DataFrame.bfill() is the supported spelling; fillna(method='bfill') is
# deprecated in recent pandas releases and emits a FutureWarning.
ranking_df.bfill()

  ranking_df.fillna(method='bfill')


Unnamed: 0,names,age,gender,rank
0,steve,20.0,Male,2.0
1,john,22.0,Male,1.0
2,richard,23.0,Male,4.0
3,randy,23.0,Male,6.0
4,randy,23.0,Male,6.0
5,micheal,23.0,Male,6.0
6,julie,22.0,Female,6.0


In [13]:
# Linear interpolation only makes sense for numeric data, so restrict it to
# the numeric columns; calling interpolate() on a frame that still contains
# object-dtype (text) columns is deprecated and triggers a FutureWarning.
# Text columns are carried over unchanged, so the displayed frame keeps all
# of its columns.
interpolated_df = ranking_df.copy()
numeric_cols = interpolated_df.select_dtypes(include='number').columns
interpolated_df[numeric_cols] = interpolated_df[numeric_cols].interpolate(method='linear')
interpolated_df

  ranking_df.interpolate(method = 'linear')


Unnamed: 0,names,age,gender,rank
0,steve,20.0,Male,2.0
1,john,22.0,Male,1.0
2,richard,22.25,Male,4.0
3,,22.5,,4.5
4,randy,22.75,Male,5.0
5,micheal,23.0,Male,5.5
6,julie,22.0,Female,6.0


In [14]:
# Defaults spelled out: drop every row that contains at least one missing value.
ranking_df.dropna(axis=0, how='any')

Unnamed: 0,names,age,gender,rank
0,steve,20.0,Male,2.0
1,john,22.0,Male,1.0
6,julie,22.0,Female,6.0


In [15]:
# Only rows in which every value is missing are dropped; partially
# filled rows are kept.
ranking_df.dropna(axis=0, how='all')

Unnamed: 0,names,age,gender,rank
0,steve,20.0,Male,2.0
1,john,22.0,Male,1.0
2,richard,,Male,4.0
4,randy,,Male,
5,micheal,23.0,Male,
6,julie,22.0,Female,6.0


In [16]:
# Drop every column that contains at least one missing value.
# Row 3 is missing in all columns, so nothing but the index survives here.
ranking_df.dropna(axis='columns')

0
1
2
3
4
5
6


In [17]:
# Row-wise drop (the default axis): same result as a bare dropna().
ranking_df.dropna(axis='index')

Unnamed: 0,names,age,gender,rank
0,steve,20.0,Male,2.0
1,john,22.0,Male,1.0
6,julie,22.0,Female,6.0


### Removing Duplicates

In [18]:
# Seven rows made of three distinct records, each repeated at least once.
duplicate_demo_columns = {
    'column 1': [1, 1, 2, 2, 3, 3, 3],
    'column 2': list('aabbccc'),
    'column 3': list('AABBCCC'),
}
DF_obj = DataFrame(duplicate_demo_columns)
DF_obj

Unnamed: 0,column 1,column 2,column 3
0,1,a,A
1,1,a,A
2,2,b,B
3,2,b,B
4,3,c,C
5,3,c,C
6,3,c,C


In [19]:
# Flag each row that repeats an earlier row; the first occurrence stays False.
DF_obj.duplicated(keep='first')

0    False
1     True
2    False
3     True
4    False
5     True
6     True
dtype: bool

In [20]:
# Keep only the first occurrence of each fully identical row.
DF_obj.drop_duplicates(keep='first')

Unnamed: 0,column 1,column 2,column 3
0,1,a,A
2,2,b,B
4,3,c,C


In [21]:
# Same frame as before, except row 5 now carries 'D' in column 3, so it is
# no longer a full duplicate of rows 4 and 6.
partial_duplicate_columns = {
    'column 1': [1, 1, 2, 2, 3, 3, 3],
    'column 2': list('aabbccc'),
    'column 3': list('AABBCDC'),
}
DF_obj = DataFrame(partial_duplicate_columns)

DF_obj

Unnamed: 0,column 1,column 2,column 3
0,1,a,A
1,1,a,A
2,2,b,B
3,2,b,B
4,3,c,C
5,3,c,D
6,3,c,C


In [22]:
# Judge duplicates on 'column 3' alone: the first row for each distinct
# value in that column is kept.
DF_obj.drop_duplicates(subset='column 3')

Unnamed: 0,column 1,column 2,column 3
0,1,a,A
2,2,b,B
4,3,c,C
5,3,c,D


### Concatenating and Transforming

In [None]:
# 6x6 frame holding the integers 0..35 laid out row by row.
DF_obj = DataFrame(np.reshape(np.arange(36), (6, 6)))
DF_obj

In [None]:
# 5x3 frame holding the integers 0..14 laid out row by row.
DF_obj_2 = DataFrame(np.reshape(np.arange(15), (5, 3)))
DF_obj_2

In [None]:
# Side-by-side concatenation: rows are aligned on the index and the second
# frame's columns are appended to the right of the first.
pd.concat([DF_obj, DF_obj_2], axis='columns')

In [None]:
# Stacked concatenation (the default axis): the second frame's rows are
# appended below the first, with columns aligned by label.
pd.concat([DF_obj, DF_obj_2], axis='index')

#### dropping data

In [None]:
# Remove the rows labelled 0 and 2; a new frame is returned and DF_obj is unchanged.
DF_obj.drop(index=[0, 2])

In [None]:
# Remove the columns labelled 0 and 2; a new frame is returned and DF_obj is unchanged.
DF_obj.drop(columns=[0, 2])

#### Adding data

In [None]:
# Six-element Series (0..5) that will be joined to DF_obj as a new column.
# Built via pd.Series because a bare `Series` name is not imported until a
# later cell, so the original call raised NameError on a fresh kernel.
# The name is passed to the constructor; it becomes the column label on join.
series_obj = pd.Series(np.arange(6), name="added_variable")
series_obj

In [None]:
# Attach the named Series to DF_obj as an extra column, aligned on the index.
variable_added = DF_obj.join(series_obj)
variable_added

In [None]:
# Stack the frame on top of itself while keeping the original row labels,
# so every index value appears twice in the result.
added_datatable = pd.concat([variable_added] * 2, ignore_index=False)
added_datatable

In [None]:
# Same stacking, but discard the original row labels and renumber the
# result with a fresh 0..n-1 index.
added_datatable = pd.concat([variable_added] * 2, ignore_index=True)
added_datatable

#### Sorting

In [None]:
# Order the rows by the column labelled 5, largest value first.
DF_sorted = DF_obj.sort_values(by=5, ascending=False)
DF_sorted

### Grouping and aggregation

In [23]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [24]:
# NOTE(review): this is an absolute path, so the cell presumably only runs in
# the workspace it was written in; consider a path relative to a configurable
# data directory.
address = '/workspaces/python-for-data-science-and-machine-learning-essential-training-part-1-3006708/data/mtcars.csv'

# Read the CSV, then replace its header with explicit column names
# (the first column holds the car model name).
cars = pd.read_csv(address).set_axis(
    ['car_names', 'mpg', 'cyl', 'disp', 'hp', 'drat', 'wt', 'qsec', 'vs', 'am', 'gear', 'carb'],
    axis='columns',
)

cars.head()

Unnamed: 0,car_names,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
1,Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
2,Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
3,Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
4,Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2


In [25]:
# Group the cars by cylinder count and report the per-group mean of every
# numeric column (the text column car_names is skipped).
cars_groups = cars.groupby('cyl')
cars_groups.mean(numeric_only=True)

Unnamed: 0_level_0,mpg,disp,hp,drat,wt,qsec,vs,am,gear,carb
cyl,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
4,26.663636,105.136364,82.636364,4.070909,2.285727,19.137273,0.909091,0.727273,4.090909,1.545455
6,19.742857,183.314286,122.285714,3.585714,3.117143,17.977143,0.571429,0.428571,3.857143,3.428571
8,15.1,353.1,209.214286,3.229286,3.999214,16.772143,0.0,0.142857,3.285714,3.5


## 3. Data Visualization 101

In [None]:
#this has no code in the tutorial

## 4. Practical Data Visualization

In [None]:
#check notebooks 4_1-4_7

## 5. Exploratory Data Analysis

## 6. Getting Started with Machine Learning

## 7. Data Sourcing via Web Scraping

## 8. Collaborative Analytics with Streamlit