In [26]:
import pandas as pd
import numpy as np

## What is a pandas dataframe
A DataFrame is a 2-dimensional data structure that can store data of different types (including characters, integers, floating point values, categorical data and more) in columns. 

### What makes a Dataframe
Let's start from the very beginning, **Numpy arrays**.
<br>
*Numpy* arrays can only contain one type of data at a time, if you try to mix and match, let's see what happens

In [33]:
# Rough Area
a = np.array([1,2,3])
a[0]

####################
a = np.array([1,2,3])
a[0]

####################
b = np.array([1,2,'hi'])
b[0]

####################
c = np.array([1,2,True])
c[0]

1

### What is a Pandas Series
At the very core, Pandas Series are numpy arrays internally, with some functionality added on top.

Let's have a look

In [48]:
# Rough
pd.Series([1,2,3])

0    1
1    2
2    3
dtype: int64

In [49]:
np.array([1,2,3])

array([1, 2, 3])

## Let's stick the Pandas Series together
If you took a couple of pandas series objects of the same length, you will end up with a pandas dataframe

In [52]:
a = pd.Series([1,2,3])
b = pd.Series([4,5,6])

In [55]:
type(a)
type(b)

pandas.core.series.Series

In [61]:
a.index

RangeIndex(start=0, stop=3, step=1)

In [64]:
a.values

array([1, 2, 3])

In [69]:
a.to_frame

<bound method Series.to_frame of 0    1
1    2
2    3
dtype: int64>

In [71]:
df = pd.DataFrame({'a':a.values,'b':b.values})

In [74]:
df

Unnamed: 0,a,b
0,1,4
1,2,5
2,3,6


In [76]:
type(df)

pandas.core.frame.DataFrame

In [78]:
type(df['a'])

pandas.core.series.Series

In [83]:
# Pay attention here
type(df[['a']])

pandas.core.frame.DataFrame

## Reading data with pandas
1. Reading with a csv
2. Reading from a dictionary



### Reading with a csv

In [85]:
df = pd.read_csv('dataset/CarPrice_Assignment.csv')

In [86]:
df.head()

Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,1,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,2,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,3,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,4,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950.0
4,5,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450.0


### A Pandas DataFrame is made up of Pandas Series stacked together
<img src="images/pandas_dataframe.png" alt="Picture" width="400"/>

With every row and column indexed, starting from 0

<img src="images/series.png" alt="Picture" width="400"/>


In [91]:
df['wheelbase'].max()
df['wheelbase'].min()

86.6

### Some basic statistics of the numerical data of my data

In [90]:
df.describe()

Unnamed: 0,car_ID,symboling,wheelbase,carlength,carwidth,carheight,curbweight,enginesize,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
count,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0
mean,103.0,0.834146,98.756585,174.049268,65.907805,53.724878,2555.565854,126.907317,3.329756,3.255415,10.142537,104.117073,5125.121951,25.219512,30.75122,13276.710571
std,59.322565,1.245307,6.021776,12.337289,2.145204,2.443522,520.680204,41.642693,0.270844,0.313597,3.97204,39.544167,476.985643,6.542142,6.886443,7988.852332
min,1.0,-2.0,86.6,141.1,60.3,47.8,1488.0,61.0,2.54,2.07,7.0,48.0,4150.0,13.0,16.0,5118.0
25%,52.0,0.0,94.5,166.3,64.1,52.0,2145.0,97.0,3.15,3.11,8.6,70.0,4800.0,19.0,25.0,7788.0
50%,103.0,1.0,97.0,173.2,65.5,54.1,2414.0,120.0,3.31,3.29,9.0,95.0,5200.0,24.0,30.0,10295.0
75%,154.0,2.0,102.4,183.1,66.9,55.5,2935.0,141.0,3.58,3.41,9.4,116.0,5500.0,30.0,34.0,16503.0
max,205.0,3.0,120.9,208.1,72.3,59.8,4066.0,326.0,3.94,4.17,23.0,288.0,6600.0,49.0,54.0,45400.0


## Reading/Writing

<img src="images/read_write_tabular_data.png" alt="Picture" width="800"/>

Pandas DataFrame. pandas supports many different file formats or data sources out of the box (csv, excel, sql, json, parquet, …), each of them with the prefix read_*.

In [93]:
# Viewing the heads and tails of the data
df.head()
df.head(10)
df.tail()
df.tail(3)

Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
202,203,-1,volvo 244dl,gas,std,four,sedan,rwd,front,109.1,...,173,mpfi,3.58,2.87,8.8,134,5500,18,23,21485.0
203,204,-1,volvo 246,diesel,turbo,four,sedan,rwd,front,109.1,...,145,idi,3.01,3.4,23.0,106,4800,26,27,22470.0
204,205,-1,volvo 264gl,gas,turbo,four,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,9.5,114,5400,19,25,22625.0


In [94]:
# A check on how pandas interpreted each of the column data types can be done by requesting the pandas dtypes attribute:
df.dtypes

car_ID                int64
symboling             int64
CarName              object
fueltype             object
aspiration           object
doornumber           object
carbody              object
drivewheel           object
enginelocation       object
wheelbase           float64
carlength           float64
carwidth            float64
carheight           float64
curbweight            int64
enginetype           object
cylindernumber       object
enginesize            int64
fuelsystem           object
boreratio           float64
stroke              float64
compressionratio    float64
horsepower            int64
peakrpm               int64
citympg               int64
highwaympg            int64
price               float64
dtype: object

### Saving to a xlsx

In [None]:
df.to_excel("carprice.xlsx", sheet_name="orignal_sheet", index=False)

In [96]:
# Technical summary of a DataFrame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   car_ID            205 non-null    int64  
 1   symboling         205 non-null    int64  
 2   CarName           205 non-null    object 
 3   fueltype          205 non-null    object 
 4   aspiration        205 non-null    object 
 5   doornumber        205 non-null    object 
 6   carbody           205 non-null    object 
 7   drivewheel        205 non-null    object 
 8   enginelocation    205 non-null    object 
 9   wheelbase         205 non-null    float64
 10  carlength         205 non-null    float64
 11  carwidth          205 non-null    float64
 12  carheight         205 non-null    float64
 13  curbweight        205 non-null    int64  
 14  enginetype        205 non-null    object 
 15  cylindernumber    205 non-null    object 
 16  enginesize        205 non-null    int64  
 1

### Selecting subsets of the DataFrame
<img src="images/columns.png" alt="Picture" width="600"/>


<img src="images/rows.png" alt="Picture" width="600"/>

<img src="images/columns_rows.png" alt="Picture" width="600"/>

There are mainly 2 ways to do this.
- loc
- iloc

**'iloc'** is where, you select by indexes of rows and columns

Let's see some examples

In [103]:
df.iloc[0,0]

1

In [104]:
df.iloc[10,10]

176.8

In [107]:
df.iloc[:5,1]

0    3
1    3
2    1
3    2
4    2
Name: symboling, dtype: int64

In [109]:
df.iloc[:10,:2]

Unnamed: 0,car_ID,symboling
0,1,3
1,2,3
2,3,1
3,4,2
4,5,2
5,6,2
6,7,1
7,8,1
8,9,1
9,10,0


In [110]:
df.iloc[2:10,1:4]

Unnamed: 0,symboling,CarName,fueltype
2,1,alfa-romero Quadrifoglio,gas
3,2,audi 100 ls,gas
4,2,audi 100ls,gas
5,2,audi fox,gas
6,1,audi 100ls,gas
7,1,audi 5000,gas
8,1,audi 4000,gas
9,0,audi 5000s (diesel),gas


### The other apporach is the .loc

In [116]:
df.loc[1,'CarName']

'alfa-romero stelvio'

In [124]:
df['CarName']

0            alfa-romero giulia
1           alfa-romero stelvio
2      alfa-romero Quadrifoglio
3                   audi 100 ls
4                    audi 100ls
                 ...           
200             volvo 145e (sw)
201                 volvo 144ea
202                 volvo 244dl
203                   volvo 246
204                 volvo 264gl
Name: CarName, Length: 205, dtype: object

In [125]:
type(df['CarName'])

pandas.core.series.Series

In [123]:
df[['CarName','fueltype']]

Unnamed: 0,CarName,fueltype
0,alfa-romero giulia,gas
1,alfa-romero stelvio,gas
2,alfa-romero Quadrifoglio,gas
3,audi 100 ls,gas
4,audi 100ls,gas
...,...,...
200,volvo 145e (sw),gas
201,volvo 144ea,gas
202,volvo 244dl,gas
203,volvo 246,diesel


In [126]:
type(df[['CarName','fueltype']])

pandas.core.frame.DataFrame

## How to create plots in Pandas