In [1]:
import pandas as pd
import numpy as np
import os
import sys

In [2]:
from IPython.display import Image

In [3]:
from matplotlib import pyplot as plt
import seaborn as sns
sns.set()
import io

# Numpy Basics 

## Arrays 

In [4]:
Image(url='../Assets/Arrays.png', width=900, height=900)

## Basic Properties 

In [5]:
# One-dimensional array built from a flat Python list; report its core attributes.
array_1d = np.array([10, 20, 30, 40])
print(
    f"Dimension: {array_1d.ndim}",
    f"Shape: {array_1d.shape}",
    f"Total Elements: {array_1d.size}",
    f"Type: {array_1d.dtype}",
    sep="\n",
)

Dimension: 1
Shape: (4,)
Total Elements: 4
Type: int64


In [6]:
# Two-dimensional array: each inner list becomes one row (3 rows x 4 columns).
# The rows mix ints and floats, and the resulting array has a single float dtype.
array_2d = np.array([
    [0.1, 0.5, 300, 400],
    [400, 0.7, 600, 0.6],
    [700, 0.5, 900, 0.6],
])
print(
    f"Dimension: {array_2d.ndim}",
    f"Shape: {array_2d.shape}",
    f"Total Elements: {array_2d.size}",
    f"Type: {array_2d.dtype}",
    sep="\n",
)

Dimension: 2
Shape: (3, 4)
Total Elements: 12
Type: float64


## Built in Functions to create arrays 

In [7]:
# Length-10 vector in which every element is 1.0.
ones = np.ones(10)
ones

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [8]:
# 10 rows x 2 columns, every element 0.0.
zeros = np.zeros((10, 2))
zeros

array([[0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.]])

In [9]:
# 3 x 2 array in which every element is the same constant (10).
array_constant = np.full((3, 2), 10)
array_constant

array([[10, 10],
       [10, 10],
       [10, 10]])

In [10]:
# Evenly spaced integers from 10 up to (but not including) 20, in steps of 2.
array_arange = np.arange(10, 20, 2)
array_arange

array([10, 12, 14, 16, 18])

In [11]:
# Seed the global generator first so that "Restart & Run All" reproduces the
# same numbers; without a seed every run displays a different array.
np.random.seed(42)
# 5 x 2 array of uniform random floats in the half-open interval [0.0, 1.0).
array_random = np.random.random(size=(5,2))
array_random

array([[0.88654278, 0.75947461],
       [0.68756093, 0.62196963],
       [0.93153383, 0.44753164],
       [0.7848515 , 0.81875025],
       [0.25572362, 0.86488459]])

## Accessing Elements from arrays 

In [12]:
# Ten consecutive integers, 20 through 29, used by the indexing examples that follow.
array = np.arange(start=20, stop=30)
array

array([20, 21, 22, 23, 24, 25, 26, 27, 28, 29])

### Accessing single element 

In [13]:
array[2]

22

### Using a list of indices 

In [14]:
array[[0,4,5]]

array([20, 24, 25])

###  Using a slice 

<p font-size=500>
    
    
<b> Slicing Syntax </b>
``` [start: stop :step] ```
    

* Default Values:
    * start = 0
    * stop = length of the array (the stop position itself is excluded)
    * step = 1
</p>

In [15]:
# Three slices shown together as a tuple:
#   positions 2, 4, 6  |  everything before position 8  |  the whole array
array[2:8:2], array[:8], array[:]

(array([22, 24, 26]),
 array([20, 21, 22, 23, 24, 25, 26, 27]),
 array([20, 21, 22, 23, 24, 25, 26, 27, 28, 29]))

### Boolean Indexing

In [16]:
array[[False, False, True, True, True, False, False, False, False, False]]

array([22, 23, 24])

In [17]:
# Element-wise comparison: produces a boolean array that is True exactly where
# the corresponding element of `array` is greater than 25.
boolean_mask = array > 25
boolean_mask


array([False, False, False, False, False, False,  True,  True,  True,
        True])

In [18]:
array[boolean_mask]

array([26, 27, 28, 29])

## Returning a View vs Copy 

In [19]:
# Select positions 1..6 of the same array in two different ways so that the
# following cells can compare how each selection reacts to a write.
array = np.arange(0, 10)

array_slice = array[1:7]                  # selection via a slice
array_index_list = array[[1,2,3,4,5,6]]   # selection via an explicit list of indices

for label, values in (
    ('original array', array),
    ('array Slice', array_slice),
    ('array index list', array_index_list),
):
    print(label, values)

original array [0 1 2 3 4 5 6 7 8 9]
array Slice [1 2 3 4 5 6]
array index list [1 2 3 4 5 6]


In [20]:
# Write through the slice. The printed result shows that the original array
# changes too (its element at position 1 becomes -200), while the selection made
# with a list of indices keeps its old values.
array_slice[0] = -200
print('original array', array)
print('array slice', array_slice)
print('array index list', array_index_list)

original array [   0 -200    2    3    4    5    6    7    8    9]
array slice [-200    2    3    4    5    6]
array index list [1 2 3 4 5 6]


In [22]:
# Write to the selection made with a list of indices. The printed result shows
# that only this array changes; the original array and the slice are untouched.
array_index_list[0] = -400
print('original array', array)
print('array slice', array_slice)
print('array index list', array_index_list)

original array [   0 -200    2    3    4    5    6    7    8    9]
array slice [-200    2    3    4    5    6]
array index list [-400    2    3    4    5    6]


# Pandas Basics

## Series

In [25]:
Image(url='../Assets/series.png', width=900, height=900)

### Common Methods and Attributes


* pd.Series.values
* pd.Series.index



* pd.Series.mean()
* pd.Series.median()




* pd.Series.isna()
* pd.Series.dropna()


In [50]:
series = pd.Series([100,200,700,900, 130], index=['a','c','e','z', 't'])

In [None]:
series

In [None]:
series.index, series.values

In [None]:
series.mean(), series.median()

In [None]:
# Series with two missing entries (labels 'b' and 'e'), used to demonstrate
# isna() and dropna().
s1 = pd.Series(
    data=[1, np.nan, 2, 3, np.nan],
    index=list('abcde'),
)
s1

In [None]:
s1.isna()

In [None]:
s1.dropna()

###  Selecting Data  

In [None]:
# Ten values (10, 20, ..., 100) labelled 'a' through 'j'.
s = pd.Series(
    data=list(range(10, 101, 10)),
    index=list('abcdefghij'),
)
s

#### Methods to avoid

In [None]:
s['a']

In [None]:
# Plain [] with an integer on a string-labelled Series is ambiguous: 0 is not a
# label here, so it can only be meant as a position. Use s.iloc[0] to say so explicitly.
# NOTE(review): recent pandas releases deprecate this positional fallback —
# confirm the behaviour on the installed version.
s[0]

In [None]:
s1 = pd.Series([1,2,3,4], index=[10, 20, 0,  100])
s1

In [None]:
# With an integer index, plain [] looks 0 up as a *label*: this is the entry
# labelled 0 (third in the Series), not the first element.
s1[0]

In [None]:
s2 = pd.Series([10, 20, 30, 40], index=[5,6,7,8])
# Plain [] on an integer-labelled Series is a label lookup. There is no label 0
# here, so s2[0] raises KeyError. Catch it so the failure is shown to the reader
# (instead of being hidden in a commented-out line) and the notebook still runs
# from top to bottom.
try:
    s2[0]
except KeyError as error:
    print(f"KeyError: {error}")

####  Position Based Indexing 


`Syntax :  series_object.iloc[ <selection> ]  `

 `<selection> can be`

 - Single Position - 10
 - List of Positions - [0, 4, 10]
 - Slices of positions - 0 : 10
     - Empty Slice " : "  means all positions, from the first to the last

In [None]:
s2.iloc[0]

In [None]:
s2.iloc[[1,2]]

In [None]:
s2.iloc[1:]

####  Label Based Indexing 

`Syntax :  series_object.loc[ <selection> ]  `

 `<selection> can be`

 - Single Label - 'a'
 - List of Labels -  `[ 'a', 'b', 'd']`
 - Slices of Labels - 'a' : 'd' (unlike position slices, both end labels are included)
     - Empty Slice " : "  means all labels from the first label to the last label 

In [None]:
s3 = pd.Series([1,2,3,4,5], index=['a','b','c','d','e'])
s3


In [None]:
s3.loc['b']

In [None]:
s3.loc[['a','d']]

In [None]:
s3.loc['a': 'd']

#### Boolean Indexing

In [None]:
s4 = pd.Series([100, 300, 400, 500, 600], index=['a','z','b','k','n'])
s4

In [None]:
s4.loc[[True, False, True, False, True]]

In [None]:
mask = s4 >= 400
mask

In [None]:
s4.loc[mask]

## DataFrame 

In [None]:
Image(url='../Assets/dataframe.png', width=900, height=900)

In [None]:
# 5 x 4 frame holding 0..19 row by row, with string labels on both axes.
data_frame = pd.DataFrame(
    data=np.arange(20).reshape(5, 4),
    index=list('pqrst'),
    columns=list('ABCD'),
)

In [None]:
data_frame

In [None]:
data_frame.columns, data_frame.index

In [None]:
data_frame.values

In [None]:
# One row label plus ':' for the columns selects the whole row labelled 'r';
# the next cell checks what type of object comes back.
row_1 =  data_frame.loc['r', :]
row_1

In [None]:
type(row_1)

In [None]:
# ':' for the rows plus one column label selects the whole column 'A';
# the next cell checks what type of object comes back.
col_1 =data_frame.loc[:, 'A']
col_1

In [None]:
type(col_1)

### Selecting Data 

In [None]:
# 5 x 4 frame holding 20..39 row by row; rows labelled 'a'-'e', columns 'A'-'D'.
data_frame = pd.DataFrame(
    data=np.arange(20, 40).reshape(5, 4),
    index=list('abcde'),
    columns=list('ABCD'),
)

In [None]:
data_frame

#### Methods to avoid 

In [None]:
data_frame['A']

In [None]:
data_frame[['A','B','C']]

In [None]:
# Plain [] with an integer slice selects *rows* by position (here the first two),
# even though plain [] with a name selects a column — one reason to avoid it.
data_frame[0:2]

In [None]:
# Plain [] with a label slice also selects *rows* (labels 'a' through 'b'), not columns.
data_frame['a':'b']

#### Position Based Indexing

`Syntax :  data_frame.iloc[ <row_selection>, <column_selection> ]  `

 `<row_selection> and <column_selection> can be`

 - Single Position - 10
 - List of Positions - [0, 4, 10]
 - Slices of positions - 0 : 10
     - Empty Slice " : "  means all positions, from the first to the last

In [None]:
data_frame

In [None]:
data_frame.iloc[[0, 1, 3] , 0: 3]

#### Label Based Indexing 

`Syntax :  data_frame.loc[ <row_selection>, <column_selection> ]  `

 `<row_selection> and <column_selection> can be`

 - Single Label - 'a'
 - List of Labels -  `[ 'a', 'b', 'd']`
 - Slices of Labels - 'a' : 'd' (unlike position slices, both end labels are included)
     - Empty Slice " : "  means all labels from the first label to the last label

In [None]:
data_frame

In [None]:
data_frame.loc[:, ['A','B','D']]

In [None]:
data_frame.loc[['a','d'], 'A':'C']

####  Combining Position and Label based indexing 

In [None]:
data_frame

In [None]:
# Rows by position, columns by label: get_indexer translates the column labels
# 'A' and 'C' into their integer positions so the whole selection can go through iloc.
data_frame.iloc[[0,1], data_frame.columns.get_indexer(['A','C'])]

In [None]:
# The same selection through loc: data_frame.index[[0, 1]] translates row
# positions 0 and 1 into their labels.
data_frame.loc[data_frame.index[[0, 1]], ['A','C']]

#### Boolean Indexing/ Logical Selection

`Syntax :  data_frame.loc[ <row_selection>, <column_selection> ]  where`

     - <row_selection> can be a boolean array/list or Series with the same number of elements as there are rows in the
       dataframe
     - <column_selection> can be a boolean array/list or Series with the same number of elements as there are columns in the
       dataframe


In [None]:
data_frame

In [None]:
data_frame.loc[[True, False, True, True, True], [False, False, True, True]]

In [None]:
data_frame

In [None]:
mask1 = data_frame['A'] > 25

In [None]:
mask1

In [None]:
data_frame.loc[mask1, [False, False, True, True]]

In [None]:
data_frame.loc[mask1, :]

In [None]:
mask2 = data_frame['A'] > 30
mask3 = data_frame['B'] > 28

In [None]:
mask2

In [None]:
mask3

In [None]:
mask2 | mask3

In [None]:
data_frame.loc[mask2 | mask3, :]

In [None]:
data_frame

### Setting a value

In [None]:
# Fresh 5 x 4 frame (values 20..39) so the assignment example starts from a known state.
data_frame = pd.DataFrame(
    data=np.arange(20, 40).reshape(5, 4),
    index=list('abcde'),
    columns=list('ABCD'),
)
data_frame

In [None]:
# Assignment through loc: sets column 'D' to 0 for the rows labelled 'a' to 'd'
# (row 'e' is not part of the selection), then displays the modified frame.
data_frame.loc[['a','b','c','d'], ['D']] = 0 
data_frame

# Things to keep in Mind 

## Returning a View Vs Copy

### Returning a View vs Copy Numpy

In [None]:
# Positions 1..6 of the same array, selected once with a slice and once with an
# explicit list of indices; the next cells write to each selection in turn.
array = np.arange(0, 10)
array_copy = array[[1,2,3,4,5,6]]   # selection via a list of indices
array_view = array[1:7]             # selection via a slice

for label, values in (
    ('original array', array),
    ('array view', array_view),
    ('array copy', array_copy),
):
    print(label, values)

In [None]:
# Write through the slice-based selection; as in the identical NumPy example
# earlier in this notebook, the change is also visible in the original array.
array_view[0] = -200
print('original array', array)
print('array view', array_view)
print('array copy', array_copy)

In [None]:
# Write to the index-list selection; as in the identical NumPy example earlier
# in this notebook, only this array changes.
array_copy[0] = -400
print('original array', array)
print('array view', array_view)
print('array copy', array_copy)

### Returning a View Vs Copy Pandas 

In [None]:
# Build a 5 x 4 frame (values 100..119) and select all of it twice: once with a
# label slice and once with explicit lists of labels. The following cells write
# to each selection and compare the effect on the original frame.
data_frame = pd.DataFrame(
    np.arange(100, 120).reshape(5, 4),
    columns=['A', 'B', 'C', 'D'],
    index=['a', 'b', 'c', 'd', 'e'],
)
data_frame_view = data_frame.loc['a':'e', :]
data_frame_copy = data_frame.loc[list('abcde'), list('ABCD')]

separator = '----------------------------'
for title, frame in (
    ('Original Data Frame', data_frame),
    ('Data Frame View', data_frame_view),
    ('Data Frame Copy', data_frame_copy),
):
    print(separator)
    print(title)
    print(frame)
print(separator)

In [None]:
# Write to the frame obtained with a label slice, then print all three frames
# to see which of them changed.
# NOTE(review): whether this write also shows up in `data_frame` depends on the
# pandas version and its copy-on-write setting — confirm on the installed version.
data_frame_view.loc[:, 'D'] = 0

separator = '----------------------------'
for title, frame in (
    ('Original Data Frame', data_frame),
    ('Data Frame View', data_frame_view),
    ('Data Frame Copy', data_frame_copy),
):
    print(separator)
    print(title)
    print(frame)
print(separator)

In [None]:
# Write to the frame obtained with explicit label lists, then print all three
# frames to see which of them changed.
data_frame_copy.loc[:, 'A'] = -100

separator = '----------------------------'
for title, frame in (
    ('Original Data Frame', data_frame),
    ('Data Frame View', data_frame_view),
    ('Data Frame Copy', data_frame_copy),
):
    print(separator)
    print(title)
    print(frame)
print(separator)

## Pandas always aligns on index for series and dataframe 

### Arrays Addition

In [None]:
Image(url='../Assets/ArraysAddition.png', width=900, height=900)

In [None]:
# Arrays carry no labels, so addition pairs elements purely by position.
a1 = np.array([10, 20, 30, 40, 50])
b1 = np.array([100, 200, 300, 400, 500])
a1 + b1

### Series Addition 

In [None]:
Image(url='../Assets/SeriesAddition.png', width=900, height=900)

In [None]:
# Same five labels in both Series, but s2 lists 'c' before 'b'.
s1 = pd.Series(
    data=[10, 20, 30, 40, 50],
    index=['a', 'b', 'c', 'd', 'e'],
)
s2 = pd.Series(
    data=[100, 200, 300, 400, 500],
    index=['a', 'c', 'b', 'd', 'e'],
)

In [None]:
s1

In [None]:
s2

In [None]:
s1 + s2

In [None]:
# The label sets only partly overlap: 'd' exists only in s1 and 'f' only in s2.
s1 = pd.Series(
    data=[1, 2, 3, 4, 5],
    index=['a', 'b', 'c', 'd', 'e'],
)
s2 = pd.Series(
    data=[10, 11, 12, 13, 15],
    index=['a', 'b', 'e', 'c', 'f'],
)
s1 + s2

### Columns assignment 

In [None]:
# Two frames with the same row labels; df2 lists 'd' before 'c'.
df1 = pd.DataFrame(
    data=np.arange(20).reshape(5, 4),
    index=['a', 'b', 'c', 'd', 'e'],
    columns=['A', 'B', 'C', 'D'],
)
df2 = pd.DataFrame(
    data=np.arange(10).reshape(5, 2),
    index=['a', 'b', 'd', 'c', 'e'],
    columns=['E', 'F'],
)

In [None]:
df1

In [None]:
df2

In [None]:
Image(url='../Assets/PandasColumnsIndexing.png', width=900, height=900)

In [None]:
# The new column is matched on index labels, not on position: each row of df1
# receives the df2['E'] value that carries the same row label.
df1.loc[:, 'E'] = df2['E']

In [None]:
df1

# Matplotlib Basics 

## Anatomy of a matplotlib figure 

In [None]:
Image(url='../Assets/anatomy.png', width=900, height=900)

## Fig Vs Axes 

In [None]:
Image(url='../Assets/subplots.png', width=900, height=900)

In [None]:
Image(url='../Assets/seaborn_basics.png', width=900, height=900)

### Line Plot 

In [None]:
fig,axes = plt.subplots(nrows=1, ncols=1, figsize=(15,10))
x = np.arange(100)
# Fix the seed so the random series, and therefore the figure, is identical on
# every "Restart & Run All".
np.random.seed(42)
y = np.random.random(100)
# Line through the 100 points, with a circular marker at each one.
axes.plot(x,y,marker='o')

### Scatter Plot 

In [None]:
# Scatter of 100 points: x runs over 0..99 and y over 200..299.
fig, axes = plt.subplots(figsize=(15, 10), nrows=1, ncols=1)
x = np.arange(start=0, stop=100)
y = np.arange(start=200, stop=300)
axes.scatter(x=x, y=y)


## Multiple Subplots 

In [None]:
# Request a grid of 2 rows x 3 columns of subplots with a shared y axis, then
# print the type and shape of the returned `axes` container.
fig,axes = plt.subplots(nrows=2, ncols=3, figsize=(15,10), sharey=True)
print(type(axes), axes.shape)


In [None]:
# Same 2 x 3 grid; an individual Axes is addressed by (row, column).
fig, axes = plt.subplots(figsize=(15, 10), nrows=2, ncols=3, sharey=True)
axes[0, 0].plot([1, 2, 3])                  # first row, first column
axes[1, 2].plot([6, 7, 8])                  # second row, third column
axes[0, 1].set_title('This is Axes  0 1')   # first row, second column

## Common Modifications using Axes reference 

```
Axes.set_xlabel()         Axes.set_ylabel()
Axes.set_xlim()           Axes.set_ylim()
Axes.set_xticks()         Axes.set_yticks()
Axes.set_xticklabels()    Axes.set_yticklabels()
Axes.set_title()
Axes.tick_params()
```

In [None]:
# Ten consecutive days ('1981-01-01' .. '1981-01-10') with one minimum and one
# maximum reading per day.
dates = [f'1981-01-{day:02d}' for day in range(1, 11)]
min_temperature = [20.7, 17.9, 18.8, 14.6, 15.8, 15.8, 15.8, 17.4, 21.8, 20.0]
max_temperature = [34.7, 28.9, 31.8, 25.6, 28.8, 21.8, 22.8, 28.4, 30.8, 32.0]

# One font size shared by the title, axis labels, tick labels and legend.
fontsize = 20

fig, axes = plt.subplots(figsize=(15, 10), nrows=1, ncols=1)

# Two lines on the same Axes; the labels are picked up by the legend below.
axes.plot(dates, min_temperature, label='Min Temperature')
axes.plot(dates, max_temperature, label='Max Temperature')

# Title and axis labels.
axes.set_title('Daily Min and Max Temperature', fontsize=fontsize)
axes.set_xlabel('Date', fontsize=fontsize)
axes.set_ylabel('Temperature', fontsize=fontsize)

# y axis: fixed range 10..40 with a tick every 2 units.
axes.set_ylim(10, 40)
axes.set_yticks(np.arange(10, 41, 2))

# Tick styling: x tick labels are rotated by 45 degrees.
axes.tick_params(axis='x', labelsize=fontsize, labelrotation=45, size=15)
axes.tick_params(axis='y', labelsize=fontsize)

# Pin the legend's upper-left corner to the point (1, 1) in Axes coordinates.
axes.legend(fontsize=fontsize, loc='upper left', bbox_to_anchor=(1, 1))

# Try it Yourself 

In [None]:
# Load the two practice datasets from their relative CSV paths.
tips_dataset = '../Datasets/tips.csv'
iris_dataset = '../Datasets/iris.csv'
tips = pd.read_csv(filepath_or_buffer=tips_dataset)
iris = pd.read_csv(filepath_or_buffer=iris_dataset)

In [None]:
tips.head(10)

In [None]:
tips.tail()

## Questions 

### Fetch all column names from tips dataset

### Fetch only the following columns

* tip
* smoker
* time

### Fetch columns at position 0, 3, 5

### Fetch all rows where tip is greater than 5

###  Fetch all rows where tip is less than 5 and time is 'Lunch'

###  
* Fetch all rows where the person is a smoker and time is "Dinner"
* From rows fetched in above step, select rows at position [10, 20, 30]

###  Scatter Plot


* Make a scatter plot with tip on x-axis and total_bill on y-axis
* Add appropriate labels on axis
* Add title to your plot

###  Scatter Plot with legends and labels

* Fetch all rows for 'Male'
* Fetch all rows for 'Female'

* Make a single scatter plot with tip on x-axis and total_bill on y-axis for Male
* Make a single scatter plot with tip on x-axis and total_bill on y-axis for Female
* Add appropriate labels and legend

## Solutions 

### .

In [None]:
tips.columns

### . 

In [None]:
tips.loc[:, ['tip','smoker','time']]

### . 

In [None]:

tips.iloc[:, [0,3,5]]

### . 

In [None]:
# Boolean mask: True for every row whose tip is greater than 5.
c1 = tips['tip'].gt(5)
tips.loc[c1, :]

### . 

In [None]:
# Both conditions must hold, so the two masks are combined element-wise with &.
c1 = tips['tip'].lt(5)
c2 = tips['time'].eq('Lunch')
tips.loc[c1 & c2]

### . 

In [None]:
# Filter by label-based boolean selection first, then pick rows 10, 20 and 30
# of the filtered result by position.
c1 = tips['smoker'].eq('Yes')
c2 = tips['time'].eq('Dinner')
tips.loc[c1 & c2, :].iloc[[10, 20, 30], :]

In [None]:
# One point per row of `tips`: tip on the x axis, total bill on the y axis.
fig, axes = plt.subplots(figsize=(15, 10))
axes.scatter(x=tips['tip'].to_numpy(), y=tips['total_bill'].to_numpy())
axes.set_ylabel('total_bill')
axes.set_xlabel('tip')
axes.set_title('Tip Vs Total Bill')

In [None]:
# Split the rows by the value of the 'sex' column.
c1 = tips['sex'] == 'Male'
c2 = tips['sex'] == 'Female'

male_data = tips.loc[c1]
female_data = tips.loc[c2]

# Draw both groups on the same Axes; each scatter call gets its own label so
# the legend can tell the groups apart.
fig,axes = plt.subplots(figsize=(15,10))
axes.scatter(male_data['tip'].values, male_data['total_bill'].values, label='Male')
axes.scatter(female_data['tip'].values, female_data['total_bill'].values, label='Female')
axes.legend()
axes.set_xlabel('tip')
axes.set_ylabel('total_bill')
# Title added for consistency with the previous scatter-plot solution, so the
# figure can be understood on its own.
axes.set_title('Tip Vs Total Bill by Sex')