# Pandas Tutorial

Pandas is a Python library providing high-performance, easy-to-use data structures and data analysis tools.

Pandas deals with the following three data structures:

    Series
    DataFrame
    Panel (removed from recent pandas versions; ignore this one)

### Importing the modules

In [None]:
#!pip install pandas

In [1]:
import pandas as pd
import numpy as np

## pandas.Series
Series is a one-dimensional labeled array capable of holding data of one type.

pandas.Series(data, index, dtype, copy)

### Create empty series

In [2]:
# Pass dtype explicitly: the default dtype of an empty Series differs between
# pandas versions (float64 in older releases, object in newer ones), and some
# releases emit a warning when it is omitted.
s = pd.Series(dtype='float64')
print(s)

Series([], dtype: float64)


### Create series from ndarray

In [3]:
data = np.array(['a','b','c','d']) 
s = pd.Series(data) #default indexing was used

print(data)
print('\n')
print(s)

['a' 'b' 'c' 'd']


0    a
1    b
2    c
3    d
dtype: object


In [4]:
s

0    a
1    b
2    c
3    d
dtype: object

In [None]:
print(s.index)
print(s.index.tolist())

In [None]:
s = pd.Series(data,index=['100','101','102','103'])

print(data)
print('\n')
print(s)

In [None]:
s.index.tolist()

### Create series from scalar

In [None]:
s = pd.Series(5, index=[0, 1, 2, 3, 4, 5, 23])
print(s)

### Create series from dictionary

In [None]:
data = {'a' : 0., 'b' : 1., 'c' : 2.}
s = pd.Series(data)

print(data)
print('\n')
print(s)

In [None]:
data = {'a' : 0., 'b' : 1., 'c' : 2.}
s = pd.Series(data, index = ['a',1,2])

print(data)
print('\n')
print(s)

In [None]:
data = {'a' : 0., 'b' : 1., 'c' : 2.}
s = pd.Series(data)
s.index = [0,1,2]
print(data)
print('\n')
print(s)

In [None]:
s.index

### Accessing elements with index

In [None]:
s = pd.Series([1, 2, 3, 4, 5], index=('a', 'f', 'b', 'q', 'c'))

print(s)
print('\n')

# The index holds string labels, so positional access goes through .iloc;
# plain s[3] relies on a deprecated integer-position fallback.
print('Fourth element')
print(s.iloc[3])
print('\n')

# The end of a positional slice is exclusive: 2:4 selects positions 2 and 3,
# i.e. the third and fourth elements.
print('Elements 3-4')
print(s.iloc[2:4])
print('\n')

print('The last 3 elements')
print(s.iloc[-3:])
print('\n')

# Label slices include both endpoints: 'a', 'f' and 'b' are all returned.
print('Letter indexing')
print(s['a':'b'])

In [None]:
s = pd.Series([1, 2, 3, 4, 5], index=(0, 2, 3, 4, 7))

print(s)
print('\n')

# With an integer index the two slicing modes give different results.
# .iloc slices by position, end exclusive: positions 2 and 3.
print('Positions 2-3 (iloc[2:4])')
print(s.iloc[2:4])
print('\n')

# .loc slices by label, both ends inclusive: labels 2, 3 and 4.
print('Labels 2-4 (loc[2:4])')
print(s.loc[2:4])
print('\n')

## pandas.DataFrame
Two-dimensional data structure, columns can be of different data types (and usually are :))

pandas.DataFrame(data, index, columns, dtype)

### Create an empty dataframe

In [None]:
df = pd.DataFrame()
print(df)

### Create dataframe from list

In [None]:
data = [1,2,3,4,5]
#df = pd.DataFrame(data)
df = pd.DataFrame(data)
print(df)

In [None]:
data = [1,2,3,4,5]
#df = pd.DataFrame(data)
df = pd.DataFrame(data, columns = ['first'])
print(df)

In [None]:
data = [['Alex',10],['Bob',12],['Clarke',13]]
df = pd.DataFrame(data,columns=['Name','Age'])
print(df)
df

### Create dataframe from a dictionary

In [None]:
data = {'col_1': [3, 2, 1, 0], 'col_2': ['a', 'b', 'c', 'd']}
df1 = pd.DataFrame.from_dict(data)
df1

In [None]:
?pd.DataFrame.from_dict

In [None]:
data = {'row_1': [3, 2, 1, 0], 'row_2': ['a', 'b', 'c', 'd']}
df1 = pd.DataFrame.from_dict(data, orient='index', 
                             columns=['c1', 'c2', 'c3', 'c4'])
df1

In [None]:
df1.dtypes

In [None]:
data = {'row_1': [3, 2, 1, 0], 'row_2': ['a', 'b', 'c']}
df1 = pd.DataFrame.from_dict(data, orient='index', 
                             columns=['c1', 'c2', 'c3', 'c4'])
df1

### Give column names

In [None]:
df1

In [None]:
df1.columns

In [None]:
df1.columns.tolist()

In [None]:
df1.columns = ['A', 'B', 'C', 'D']
df1

### Create dataframe from a list of dictionaries

In [None]:
?pd.DataFrame

In [None]:
data = [{'a': 1, 'b': 2, 'd': 0},{'a': 5, 'b': 10, 'c': 20}]
df1 = pd.DataFrame(data, index=['first', 'second']) 

print (df1)

In [None]:
data = [{'a': 1, 'b': 2},{'a': 5, 'b': 10, 'c': 20}]
df1 = pd.DataFrame(data, index=['first', 'second'], columns=['a', 'b']) 

print (df1)

In [None]:
#try to add 'c' to the list of columns
data = [{'a': 1, 'b': 2, 'd': 0},{'a': 5, 'b': 10, 'c': 20}]
df1 = pd.DataFrame(data, index=['first', 'second'], columns=['a', 'b', 'c', 'd']) 

print (df1)

### Selecting columns

In [None]:
data = [['Alex',10, 2009],['Bob',12, 2011],['Clarke',13, 2012]]
df = pd.DataFrame(data,columns=['Name','Age', 'Year'])

print(df)
print('\n')
print(df['Age'])
print(type(df['Age']))

In [None]:
print(df)
print(df[['Age']])
print(df[['Age', 'Name']])

In [None]:
# df (built above with the columns Name, Age and Year) has no 'Weight'
# column, so this selection is expected to raise a KeyError.
print(df[['Weight', 'Name']])

### Selecting rows and columns

In [None]:
df

In [None]:
df['Name']

In [None]:
df['Name'][1]

### .loc (label-based indexing), .iloc (integer position-based indexing) and .ix (a mix of the two; deprecated and removed from recent pandas versions)

In [None]:
np.random.randn(8, 4)

In [None]:
?np.random.randn

In [None]:
df = pd.DataFrame(np.random.randn(8, 4),
index = ['a','b','c','d','e','f','g','h'], columns = ['A', 'B', 'C', 'D'])

#select all rows for specific columns
print(df)
print('\n')
print(df.loc[:,['A','B']])
print('\n')

In [None]:
df = pd.DataFrame(np.random.randn(8, 4),
index = ['a','b','c','d','e','f','g','h'], columns = ['A', 'B', 'C', 'D'])


#select some rows for specific columns
print(df)
print('\n')
print(df.loc[['a','b'],['A','B']])
print('\n')



In [None]:
df = pd.DataFrame(np.random.randn(8, 4),
index = ['a','b','c','d','e','f','g','h'], columns = ['A', 'B', 'C', 'D'])


print(df)
print('\n')
print(df.loc['a']>0)
print('\n')

In [None]:
df.loc['a','B'] = 10000
df

In [None]:
df['A'] = 10000
df

In [None]:
print(df[2:3])
df[3:5]

In [None]:
df = pd.DataFrame(np.random.randn(8, 4),
index = ['a','b','c','d','e','f','g','h'], columns = ['A', 'B', 'C', 'D'])

print(df)
print('\n')
print(df.iloc[[0,1,3],[0,1]])
print('\n')

In [None]:
df = pd.DataFrame(np.random.randn(8, 4),
index = ['a','b','c','d','e','f','g','h'], columns = ['A', 'B', 'C', 'D'])


print(df)
print('\n')
print(df.iloc[0:3,[0,1]])
print('\n')

### Filtering by a column condition

In [None]:
data = [['Alex',10, 2009],['Bob',12, 2011],['Clarke',13, 2012]]
df = pd.DataFrame(data,columns=['Name','Age', 'Year'])
df

In [None]:
print(df)
df1 = df[df['Age']==10]
df1

In [None]:
print(df)
df[(df['Age']>10) & (df['Year']>2000)]

### Adding a column

In [None]:
data = [['Alex',10, 2009],['Bob',12, 2011],['Clarke',13, 2012]]
df = pd.DataFrame(data,columns=['Name','Age', 'Year'])


df

In [None]:
len(df)

In [None]:
#df['Gender']=pd.Series(['male','male','male'])
#df['Gender'] = 'male'
df['Gender'] = ['male','male','male']
print(df)

In [None]:
type(df['Gender'])

In [None]:
df['Gender'] = 1
df

### Column deletion

In [None]:
data = [['Alex',10, 2009],['Bob',12, 2011],['Clarke',13, 2012]]
df = pd.DataFrame(data,columns=['Name','Age', 'Year'])


print(df)
print('\n')

del df['Year']
print(df)

In [None]:
data = [['Alex',10, 2009],['Bob',12, 2011],['Clarke',13, 2012]]
df = pd.DataFrame(data,columns=['Name','Age', 'Year'])

print(df)
print('\n')

df.drop('Year', axis=1, inplace=True)

In [None]:
df

### Slice rows

In [None]:
data = [['Alex',10],['Bob',12],['Clarke',13],['Jane',16],['Anna',10]]
df = pd.DataFrame(data,columns=['Name','Age'],index=[2,5,1,0,3])

print(df)
print('\n')
print(df[2:4])

## Some series and dataframe functions

In [None]:
np.random.randn(4)

In [None]:
s = pd.Series(np.random.randn(4))
print(s)
print('\n')
print ("The axes are:")
print(s.axes)
print(list(s.axes[0]))
print(s.index)
print(list(s.index))

In [None]:
s = pd.Series(np.random.randn(4))
print(s)
print('\n')
print ("The data type is:")
print(s.dtype)

In [None]:
data = [['Alex',10],['Bob',12],['Clarke',13],['Jane',16],['Anna',10]]
df = pd.DataFrame(data,columns=['Name','Age'])
df

In [None]:
print(df.dtypes)

In [None]:
print(s)
print('\n')
print ("Is the Object empty?")
print(s.empty)

In [None]:
print(s)
print('\n')
print ("The dimensions of the object:")
print(s.ndim)

In [None]:
data = [['Alex',10],['Bob',12],['Clarke',13],['Jane',16],['Anna',10]]
df = pd.DataFrame(data,columns=['Name','Age'])
df

In [None]:
print(df.ndim)

In [None]:
data = [['Alex',10],['Bob',12],['Clarke',13]]
df = pd.DataFrame(data,columns=['Name','Age'])
print ("Our object is:")
print(df)
print('\n')

print ("The shape of the object is:")
print(df.shape)

In [None]:
print ("Our object is:")
print(df)

print('\n')
print('The first 2 rows of the dataframe:')
print(df.head(2))

print('\n')
print('The last 2 rows of the dataframe:')
print(df.tail(2))

In [None]:
#df[:10] = df.head(10)

In [None]:
?df.head

### Count the number of values in a column

In [None]:
data = [['Alex',10],['Bob',12],['Clarke',13],['Clarke',18]]
df = pd.DataFrame(data,columns=['Name','Age'])
df

In [None]:
x = df['Name'].value_counts()
print(type(x))
print(x)

# Missing values

In [None]:
df = pd.DataFrame(np.random.randn(5, 3), index=['a', 'c', 'e', 'f','h'],columns=['one', 'two', 'three'])

df = df.reindex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'])

print (df)

In [None]:
df['one'].isnull()

In [None]:
df['one'].isna()

In [None]:
df['one'].isna().sum()

In [None]:
df['one'].notnull()

In [None]:
len(df[df['one'].isna()])

# Replacing the missing data

In [None]:
df

In [None]:
# DataFrame.bfill() replaces the deprecated fillna(method='bfill') spelling:
# each NaN is filled with the next valid value below it in the same column.
df1 = df.bfill()
df1

### See the other options that fillna() provides in the pandas documentation :)

In [None]:
?df.fillna

In [None]:
df = pd.DataFrame(np.random.randn(5, 3), index=['a', 'c', 'e', 'f','h'],columns=['one', 'two', 'three'])

df = df.reindex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'])

print (df)

In [None]:
# Fill each column's NaNs with that column's own mean. df.mean() always
# reduces per column, whereas np.mean(df) collapses the whole frame to a
# single scalar on recent pandas versions (2.0+).
df.fillna(df.mean())

In [None]:
# Per-column means (NaNs are skipped). Using the DataFrame method keeps the
# result a Series regardless of pandas version, unlike np.mean(df).
df.mean()

In [None]:
df['one'] = df['one'].fillna(0)

In [None]:
df['one'] = df['one'].fillna(np.mean(df['one']))

In [None]:
df

In [None]:
df.index = np.arange(1, len(df)+1)

In [None]:
df

### Dropping the missing data

In [None]:
df2 = df.dropna()
df2

In [None]:
?df.dropna

### Replacing regular values

In [None]:
df = pd.DataFrame({'one':[10,20,30,40,50,2000],'two':[1000,0,30,40,50,60]})

print(df)
print('\n')
df1 = df.replace({1000:10,2000:60})
df1

# Getting unique values in a column

In [None]:
import pandas as pd
data = [['Alex',10],['Bob',12],['Clarke',10],['Clarke',18]]
df = pd.DataFrame(data,columns=['Name','Age'])
df

In [None]:
df['Name']

In [None]:
#unique values
df['Name'].unique()

In [None]:
#number of unique values
df['Name'].nunique()

## Descriptive statistics

In [None]:
# Three equally long columns: a text column plus two numeric ones.
d = dict(
    Name=pd.Series([
        'Tom', 'James', 'Ricky', 'Vin', 'Steve', 'Smith',
        'Jack', 'Lee', 'David', 'Gasper', 'Betina', 'Andres',
    ]),
    Age=pd.Series([25, 26, 25, 23, 30, 29, 23, 34, 40, 30, 51, 46]),
    Rating=pd.Series([
        4.23, 3.24, 3.98, 2.56, 3.20, 4.6,
        3.8, 3.78, 2.98, 4.80, 4.10, 3.65,
    ]),
)

# Assemble the columns into the DataFrame used by the statistics examples
df = pd.DataFrame(data=d)
df

## mean()
returns the average value

In [None]:
# Pass the method itself: help(df.mean()) would first call mean() and then
# show help for the returned object instead of for DataFrame.mean.
help(df.mean)

In [None]:
df

In [None]:
# numeric_only=True restricts the mean to the numerical columns; without it,
# recent pandas versions raise a TypeError on the string 'Name' column.
df.mean(numeric_only=True)

In [None]:
df[['Age', 'Rating']].sum()

## std()
returns standard deviation only for the numerical columns

In [None]:
# numeric_only=True restricts the standard deviation to the numerical
# columns; recent pandas versions no longer drop text columns automatically.
df.std(numeric_only=True)

## Summarizing Data
describe() computes summary statistics

In [None]:
df

In [None]:
df.describe()

# Loading data from a file into a dataframe

In [None]:
s = pd.read_csv('Practical_hw/data/gender.txt')
s

In [None]:
?pd.read_csv

In [None]:
s = pd.read_csv('Practical_hw/data/gender.txt', sep='|', index_col = 'user_id')
s

In [None]:
s = pd.read_csv('Practical_hw/data/gender.txt', sep='|')
s

## Renaming rows and columns

In [None]:
print(df1)
print (df1.rename(columns={'one' : '1', 'two' : '2'}, index = {0 : 'apple', 1 : 'banana', 2 : 'durian'}))

In [None]:
df1.columns = ['a','b']
df1

## Reindexing

In [None]:
df1 = pd.DataFrame(np.random.randn(5,3),columns=['col1','col2','col3'])
df_reindexed = df1.reindex(index=[0,2,5,10,8])
#df1.index = [0,2,5,10,8] inplace
print(df1)
print('\n')
print(df_reindexed)

In [None]:
df1 = pd.DataFrame(np.random.randn(10,3),columns=['col1','col2','col3'])
df2 = pd.DataFrame(np.random.randn(7,3),columns=['col1','col2','col3'])


print(df1)
print('\n')

print(df2)
print('\n')


#df = df2.reindex_like(df1)
df = df1.reindex_like(df2)
print(df)



In [None]:
df1 = pd.DataFrame(np.random.randn(10,3),columns=['col1','col2','col3'])
df2 = pd.DataFrame(np.random.randn(7,3),columns=['col2','col3','col4'])


print(df1)
print('\n')

print(df2)
print('\n')

df1.columns = df2.columns
df1

## Changing the indexing

In [None]:
df1 = pd.DataFrame(np.random.randn(5,3),columns=['col1','col2','col3'])

print(df1)
print('\n')

df1.index = [0,2,5,10,8]
df1

## Groupby

In [None]:
import pandas as pd

# Sample data with Team, Rank, Year and Points columns (12 rows).
# NOTE(review): the entry 'kings' (lowercase) is a different value from
# 'Kings', so groupby('Team') puts it in a group of its own — confirm
# whether this is intentional or a typo.
ipl_data = {'Team': ['Riders', 'Riders', 'Devils', 'Devils', 'Kings',
                     'kings', 'Kings', 'Kings', 'Riders', 'Royals', 'Royals', 'Riders'],
         'Rank': [1, 2, 2, 3, 3,4 ,1 ,1,2 , 4,1,2],
         'Year': [2014,2015,2014,2015,2014,2015,2016,2017,2016,2014,2015,2017],
         'Points':[876,789,863,673,741,812,756,788,694,701,804,690]}
df = pd.DataFrame(ipl_data)

print(df)
print('\n')
# Printing the groupby result shows only the GroupBy object itself,
# not the grouped rows.
print (df.groupby('Team'))

In [None]:
x = df.groupby('Team')
print (x.mean())

In [None]:
print (df.groupby('Team').median())
print (df.groupby('Team')[['Rank', 'Points']].median())

### View the groups

In [None]:
df

In [None]:
print (df.groupby('Team').groups)

## Aggregations
An aggregation function returns a single aggregated value for each group. 

In [None]:
new_df = df.groupby('Team').agg({'Rank' : 'sum', 'Points' : 'max'})
new_df

In [None]:
print(new_df.index)
print('\n')
print(new_df.columns)

In [None]:
new_df = df.groupby('Team').agg({'Rank' : 'sum', 'Points' : 'max'}).reset_index()
new_df

In [None]:
print(new_df.index)
print('\n')
print(new_df.columns)

In [None]:
new_df = df.groupby('Team').agg({'Rank' : ['sum', 'mean'], 'Points' : ['min', 'max']})
new_df

In [None]:
print(new_df.index)
print('\n')
print(new_df.columns)

In [None]:
new_df[('Rank', 'sum')]

In [None]:
new_df = df.groupby('Team')['Rank'].agg(['min', 'max'])
new_df

## Merging/Joining

In [None]:
df1 = pd.DataFrame({
         'id':[1,2,3,4,5],
         'Name': ['Alex', 'Amy', 'Allen', 'Alice', 'Ayoung'],
         'subject_id':['sub1','sub2','sub4','sub6','sub5']})
df2 = pd.DataFrame(
         {'id':[1,2,3,4,5],
         'Name': ['Billy', 'Brian', 'Bran', 'Bryce', 'Betty'],
         'subject_id':['sub2','sub4','sub3','sub6','sub5']})
print (df1)
print('\n')
print (df2)

### Merge Two DataFrames on a Key

In [None]:
pd.merge(df1,df2,on='id') 

In [None]:
df1 = pd.DataFrame({
         'id':[1,2,3,4,5],
         'Name': ['Alex', 'Amy', 'Allen', 'Alice', 'Ayoung'],
         'subject_id':['sub1','sub2','sub4','sub6','sub5']})
df2 = pd.DataFrame(
         {'my_id':[1,2,3,4,5],
         'Name': ['Billy', 'Brian', 'Bran', 'Bryce', 'Betty'],
         'subject_id':['sub2','sub4','sub3','sub6','sub5']})
print (df1)
print('\n')
print (df2)

In [None]:
pd.merge(df1,df2,left_on='id', right_on = 'my_id')

In [None]:
df1 = pd.DataFrame({
         'id':[1,2,3,4,5],
         'Name': ['Alex', 'Amy', 'Allen', 'Alice', 'Ayoung'],
         'subject_id':['sub1','sub2','sub4','sub6','sub5']})
df2 = pd.DataFrame(
         {'id':[1,2,3,4,5],
         'Name': ['Billy', 'Brian', 'Bran', 'Bryce', 'Betty'],
         'subject_id':['sub2','sub4','sub3','sub6','sub5']})
print (df1)
print('\n')
print (df2)

In [None]:
# looks for equal tuples
print(df1)
print('\n')
print(df2)

pd.merge(df1,df2,on=['id','subject_id'])

In [None]:
?pd.merge

### Left join

In [None]:
print (df1)
print('\n')
print (df2)

In [None]:
pd.merge(df1,df2, on='subject_id', how='left')

### Right join

In [None]:
pd.merge(df1,df2, on='subject_id', how='right')

### Outer join

In [None]:
print (df1)
print('\n')
print (df2)

In [None]:
#take all values of subject_id
pd.merge(df1,df2, how='outer', on='subject_id')

### Inner join

In [None]:
#take common values of subject_id
pd.merge(df1, df2, on='subject_id', how='inner')

In [None]:
pd.merge(df1,df2,on=['id','subject_id'], how = 'left')

In [None]:
?pd.merge

In [None]:
pd.merge(df2, df1, on='subject_id', how='inner')