# Pandas Tutorial

## Learning by Examples

In [None]:
# import pandas as pd

# df = pd.read_csv('data.csv')

# print(df.to_string())
 

# Pandas Getting Started

## Import Pandas

In [None]:
import pandas

# Column-oriented input: every key is a column label and every list
# holds the values for that column.
mydataset = dict(
    cars=["BMW", "Volvo", "Ford"],
    passings=[3, 7, 2],
)

# Turn the mapping into a table, using the fully qualified module name.
myvar = pandas.DataFrame(data=mydataset)

print(myvar)


    cars  passings
0    BMW         3
1  Volvo         7
2   Ford         2


## Pandas as pd

In [None]:
import pandas as pd

# Same data as before; only the import alias (pd) differs.
mydataset = {
    "cars": ["BMW", "Volvo", "Ford"],
    "passings": [3, 7, 2],
}

# The alias is used wherever the package name would otherwise appear.
myvar = pd.DataFrame(data=mydataset)

print(myvar)

    cars  passings
0    BMW         3
1  Volvo         7
2   Ford         2


## Checking Pandas Version

In [None]:
import pandas as pd

# Print the version string stored in the package's __version__ attribute.
print(pd.__version__)


1.3.5


# Pandas Series

## What is a Series?

In [None]:
import pandas as pd

# Plain Python list that supplies the values of the Series.
a = [1, 7, 2]

# A Series is a one-dimensional labelled column; with no index given,
# the values are labelled by position starting at 0.
myvar = pd.Series(data=a)

print(myvar)

0    1
1    7
2    2
dtype: int64


## Labels

In [None]:
import pandas as pd

# Values for the Series; no explicit labels are supplied.
a = [1, 7, 2]

myvar = pd.Series(data=a)

# Without explicit labels the first value carries the label 0.
print(myvar[0])


1


## Create Labels

In [None]:
import pandas as pd

# Values for the Series.
a = [1, 7, 2]

# The index argument names each value; list("xyz") expands to
# ["x", "y", "z"], one label per element of a.
myvar = pd.Series(data=a, index=list("xyz"))

print(myvar)


x    1
y    7
z    2
dtype: int64


In [None]:
import pandas as pd

# Values for the Series.
a = [1, 7, 2]

# Attach one custom label to each value.
myvar = pd.Series(
    data=a,
    index=["x", "y", "z"],
)

# Once labels exist, an item can be fetched by its label.
print(myvar["y"])


7


## Key/Value Objects as Series

In [None]:
import pandas as pd

# Key/value object: the keys become the labels of the Series.
calories = dict(day1=420, day2=380, day3=390)

myvar = pd.Series(data=calories)

print(myvar)


day1    420
day2    380
day3    390
dtype: int64


In [None]:
import pandas as pd

# Key/value object holding three days of data.
calories = dict(day1=420, day2=380, day3=390)

# Passing index selects which keys of the dictionary end up in the
# Series; "day3" is left out here.
myvar = pd.Series(
    data=calories,
    index=["day1", "day2"],
)

print(myvar)


day1    420
day2    380
dtype: int64


## DataFrames

In [None]:
import pandas as pd

# Two equally long lists, one per column of the resulting table.
data = dict(
    calories=[420, 380, 390],
    duration=[50, 40, 45],
)

# A DataFrame is the multi-dimensional counterpart of a Series.
myvar = pd.DataFrame(data=data)

print(myvar)


   calories  duration
0       420        50
1       380        40
2       390        45


# Pandas DataFrames

## What is a DataFrame?

In [None]:
import pandas as pd

# Source data, keyed by column name.
data = {
    "calories": [420, 380, 390],
    "duration": [50, 40, 45],
}

# Load the data into a DataFrame object.
df = pd.DataFrame(data=data)

print(df)

   calories  duration
0       420        50
1       380        40
2       390        45


## Locate Row

In [None]:
import pandas as pd

# Source data, keyed by column name.
data = dict(
    calories=[420, 380, 390],
    duration=[50, 40, 45],
)

# Load the data into a DataFrame object.
df = pd.DataFrame(data=data)

# loc with a single label returns that row as a Series.
print(df.loc[0])


calories    420
duration     50
Name: 0, dtype: int64


In [None]:
import pandas as pd

# Source data, keyed by column name.
data = {
    "calories": [420, 380, 390],
    "duration": [50, 40, 45],
}

# Load the data into a DataFrame object.
df = pd.DataFrame(data=data)

# A list of labels inside loc[...] returns the matching rows as a
# DataFrame rather than a Series.
print(df.loc[[0, 1]])


   calories  duration
0       420        50
1       380        40


## Named Indexes

In [None]:
import pandas as pd

# Source data, keyed by column name.
data = dict(
    calories=[420, 380, 390],
    duration=[50, 40, 45],
)

# Name the rows "day1".."day3" instead of using the default numbering.
df = pd.DataFrame(
    data=data,
    index=[f"day{n}" for n in range(1, 4)],
)

print(df)


      calories  duration
day1       420        50
day2       380        40
day3       390        45


## Locate Named Indexes

In [None]:
import pandas as pd

# Source data, keyed by column name.
data = {
    "calories": [420, 380, 390],
    "duration": [50, 40, 45],
}

# Give each row a name.
df = pd.DataFrame(
    data=data,
    index=["day1", "day2", "day3"],
)

# With a named index, loc takes the row name.
print(df.loc["day2"])


calories    380
duration     40
Name: day2, dtype: int64


## Load Files Into a DataFrame

In [None]:
# import pandas as pd

# df = pd.read_csv('data.csv')

# print(df)


# Pandas Read CSV

## Read CSV Files

In [None]:
# import pandas as pd

# df = pd.read_csv('data.csv')

# print(df.to_string())


In [None]:

# import pandas as pd
#
# df = pd.read_csv('data.csv')
#
# print(df)
#

## max_rows

In [None]:
# import pandas as pd
#
# print(pd.options.display.max_rows)
#


In [None]:
# import pandas as pd
#
# pd.options.display.max_rows = 9999
#
# df = pd.read_csv('data.csv')
#
# print(df)
#


# Pandas Read JSON

## Read JSON

In [None]:
# import pandas as pd
#
# df = pd.read_json('data.json')
#
# print(df.to_string())
#


## Dictionary as JSON

In [None]:
import pandas as pd

# JSON-shaped mapping: outer keys are column names, inner keys are the
# row labels (strings, as they would be in a JSON document).
data = {
    "Duration": {"0": 60, "1": 60, "2": 60, "3": 45, "4": 45, "5": 60},
    "Pulse": {"0": 110, "1": 117, "2": 103, "3": 109, "4": 117, "5": 102},
    "Maxpulse": {"0": 130, "1": 145, "2": 135, "3": 175, "4": 148, "5": 127},
    "Calories": {"0": 409, "1": 479, "2": 340, "3": 282, "4": 406, "5": 300},
}

# A dict of dicts can be loaded directly, without going through a file.
df = pd.DataFrame(data=data)

print(df)

   Duration  Pulse  Maxpulse  Calories
0        60    110       130       409
1        60    117       145       479
2        60    103       135       340
3        45    109       175       282
4        45    117       148       406
5        60    102       127       300


# Pandas - Analyzing DataFrames

## Viewing the Data

In [None]:
# import pandas as pd

# df = pd.read_csv('data.csv')

# print(df.head(10))


In [None]:
# import pandas as pd

# df = pd.read_csv('data.csv')

# print(df.head())


In [None]:
# import pandas as pd

# df = pd.read_csv('data.csv')

# print(df.tail())


## Info About the Data

In [None]:
# import pandas as pd

# df = pd.read_csv('data.csv')

# print(df.info())


# Cleaning Data

## Pandas - Cleaning Empty Cells

### Remove Rows

In [None]:
# import pandas as pd

# df = pd.read_csv('data.csv')

# new_df = df.dropna()

# print(new_df.to_string())

# #Notice in the result that some rows have been removed (rows 18, 22 and 28).

# #These rows had cells with empty values.


In [None]:
# import pandas as pd

# df = pd.read_csv('data.csv')

# df.dropna(inplace = True)

# print(df.to_string())

# #Notice in the result that some rows have been removed (rows 18, 22 and 28).

# #These rows had cells with empty values.


### Replace Empty Values

In [None]:
# import pandas as pd

# df = pd.read_csv('data.csv')

# df.fillna(130, inplace = True)

# print(df.to_string())

# #Notice in the result: empty cells got the value 130 (in rows 18, 22 and 28).


### Replace Only For Specified Columns

In [None]:
# import pandas as pd

# df = pd.read_csv('data.csv')

# df["Calories"].fillna(130, inplace = True)

# print(df.to_string())

# #This operation inserts 130 in empty cells in the "Calories" column (rows 18 and 28).


### Replace Using Mean, Median, or Mode

In [None]:
# import pandas as pd

# df = pd.read_csv('data.csv')

# x = df["Calories"].mean()

# df["Calories"].fillna(x, inplace = True)

# print(df.to_string())

# #As you can see in rows 18 and 28, the empty values from "Calories" were replaced with the mean: 304.68


In [None]:
# import pandas as pd

# df = pd.read_csv('data.csv')

# x = df["Calories"].median()

# df["Calories"].fillna(x, inplace = True)

# print(df.to_string())

# #As you can see in rows 18 and 28, the empty values from "Calories" were replaced with the median: 291.2


In [None]:
# import pandas as pd

# df = pd.read_csv('data.csv')

# x = df["Calories"].mode()[0]

# df["Calories"].fillna(x, inplace = True)

# print(df.to_string())

# #As you can see in rows 18 and 28, the empty values from "Calories" were replaced with the mode: 300.0


## Pandas - Cleaning Data of Wrong Format

### Convert Into a Correct Format

In [None]:
# import pandas as pd

# df = pd.read_csv('data.csv')

# df['Date'] = pd.to_datetime(df['Date'])

# print(df.to_string())


### Removing Rows

In [None]:
# import pandas as pd

# df = pd.read_csv('data.csv')

# df['Date'] = pd.to_datetime(df['Date'])

# df.dropna(subset=['Date'], inplace = True)

# print(df.to_string())


## Pandas - Fixing Wrong Data

### Replacing Values

In [None]:
# import pandas as pd

# df = pd.read_csv('data.csv')

# df.loc[7,'Duration'] = 45

# print(df.to_string())


In [None]:
# import pandas as pd

# df = pd.read_csv('data.csv')

# for x in df.index:
#   if df.loc[x, "Duration"] > 120:
#     df.loc[x, "Duration"] = 120

# print(df.to_string())


### Removing Rows

In [None]:
# import pandas as pd

# df = pd.read_csv('data.csv')

# for x in df.index:
#   if df.loc[x, "Duration"] > 120:
#     df.drop(x, inplace = True)

# #remember to include the 'inplace = True' argument to make the changes in the original DataFrame object instead of returning a copy

# print(df.to_string())


## Pandas - Removing Duplicates

### Discovering Duplicates

In [None]:
# import pandas as pd

# df = pd.read_csv('data.csv')

# print(df.duplicated())


### Removing Duplicates

In [None]:
# import pandas as pd

# df = pd.read_csv('data.csv')

# df.drop_duplicates(inplace = True)

# print(df.to_string())

# #Notice that row 12 has been removed from the result


# Pandas - Data Correlations

## Finding Relationships

In [None]:
# import pandas as pd

# df = pd.read_csv('data.csv')

# print(df.corr())


# Pandas - Plotting

## Plotting

In [None]:
# #Three lines to make our compiler able to draw:
# import sys
# import matplotlib
# matplotlib.use('Agg')

# import pandas as pd
# import matplotlib.pyplot as plt

# df = pd.read_csv('data.csv')

# df.plot()

# plt.show()

# #Two  lines to make our compiler able to draw:
# plt.savefig(sys.stdout.buffer)
# sys.stdout.flush()


## Scatter Plot

In [None]:
# #Three lines to make our compiler able to draw:
# import sys
# import matplotlib
# matplotlib.use('Agg')

# import pandas as pd
# import matplotlib.pyplot as plt

# df = pd.read_csv('data.csv')

# df.plot(kind = 'scatter', x = 'Duration', y = 'Calories')

# plt.show()

# #Two  lines to make our compiler able to draw:
# plt.savefig(sys.stdout.buffer)
# sys.stdout.flush()


In [2]:
# #Three lines to make our compiler able to draw:
# import sys
# import matplotlib
# matplotlib.use('Agg')

# import pandas as pd
# import matplotlib.pyplot as plt

# df = pd.read_csv('data.csv')

# df.plot(kind = 'scatter', x = 'Duration', y = 'Maxpulse')

# plt.show()

# #Two  lines to make our compiler able to draw:
# plt.savefig(sys.stdout.buffer)
# sys.stdout.flush()


## Histogram

In [None]:
# #Three lines to make our compiler able to draw:
# import sys
# import matplotlib
# matplotlib.use('Agg')

# import pandas as pd
# import matplotlib.pyplot as plt

# df = pd.read_csv('data.csv')

# df["Duration"].plot(kind = 'hist')

# plt.show()

# #Two  lines to make our compiler able to draw:
# plt.savefig(sys.stdout.buffer)
# sys.stdout.flush()
