# Pandas Tutorial

Pandas is a Python library providing high-performance, easy-to-use data structures and data analysis tools.

Pandas deals with the following three data structures:

    Series
    DataFrame
    Panel (ignore this one; it was deprecated and later removed from pandas)

### Importing the modules

In [None]:
import pandas as pd
import numpy as np

## pandas.Series
A Series is a one-dimensional labeled array capable of holding data of one type.

pandas.Series(data, index, dtype, copy)

### Create empty series

In [None]:
# An empty Series has no values to infer a dtype from, so name one explicitly:
# pandas 1.x warns when dtype is omitted here, and pandas 2.x changed the
# default for this case from float64 to object.
s = pd.Series(dtype='float64')
print(s)

### Create series from ndarray

In [None]:
# Build a Series from a NumPy array; no index is passed, so pandas assigns
# the default integer labels.
data = np.array(['a', 'b', 'c', 'd'])
s = pd.Series(data)

# Raw array, a blank spacer, then the Series -- one item per line.
print(data, '\n', s, sep='\n')

In [None]:
# First the Index object itself, then the same labels as a plain Python list.
print(s.index, s.index.tolist(), sep='\n')

In [None]:
# `data` is whatever an earlier cell left in scope; here it gets explicit
# string labels instead of the default integer ones.
s = pd.Series(data, index=['100', '101', '102', '103'])

print(data, '\n', s, sep='\n')

In [None]:
# Cell output: the index labels of `s` converted to a plain Python list.
s.index.tolist()

### Create series from scalar

In [None]:
# A scalar value is repeated for every label in the supplied index.
s = pd.Series(data=5, index=[0, 1, 2, 3, 4, 5, 23])
print(s)

### Create series from dictionary

In [None]:
# Dictionary keys become the index labels, dictionary values become the data.
data = {'a': 0., 'b': 1., 'c': 2.}
s = pd.Series(data)

print(data, '\n', s, sep='\n')

In [None]:
# Passing an index alongside a dict selects by label: 'a' is found in the
# dict, while 1 and 2 are not keys and therefore come out as NaN.
data = {'a': 0., 'b': 1., 'c': 2.}
s = pd.Series(data, index=['a', 1, 2])

print(data, '\n', s, sep='\n')

In [None]:
# Cell output: the Index object holding the labels of `s`.
s.index

### Accessing elements with index

In [None]:
s = pd.Series([1, 2, 3, 4, 5], index=('a', 'f', 'b', 'q', 'c'))

print(s)
print('\n')

# Positional access goes through .iloc. The labels here are strings, so a bare
# s[3] depends on a deprecated integer fallback -- and position 3 is the
# *fourth* element anyway. The third element is at position 2.
print('Third element')
print(s.iloc[2])
print('\n')

# Half-open positional slice: positions 2 and 3.
print('Elements 2-4')
print(s.iloc[2:4])
print('\n')

print('The last 3 elements')
print(s.iloc[-3:])
print('\n')

# Label-based slices via .loc include both endpoints ('a' and 'f').
print('Letter indexing')
print(s.loc['a':'f'])

## pandas.DataFrame
A two-dimensional data structure; its columns can be of different data types (and usually are :))

pandas.DataFrame(data, index, columns, dtype)

### Create an empty dataframe

In [None]:
# With no data supplied, the DataFrame has neither rows nor columns.
df = pd.DataFrame(data=None)
print(df)

### Create dataframe from list

In [None]:
# A flat list becomes a single column; no column name is given, so pandas
# labels it 0.
data = [1, 2, 3, 4, 5]
df = pd.DataFrame(data)
print(df)

In [None]:
# Same flat list, but the single column is given an explicit name.
data = [1, 2, 3, 4, 5]
df = pd.DataFrame(data, columns=['first'])
print(df)

In [None]:
# Each inner list is one row; `columns` names the two fields.
data = [
    ['Alex', 10],
    ['Bob', 12],
    ['Clarke', 13],
]
df = pd.DataFrame(data, columns=['Name', 'Age'])
print(df)  # plain-text rendering
df  # trailing expression: shown as the cell's output

### Create dataframe from a dictionary

In [None]:
# With from_dict's default orientation, every dict key becomes a column.
data = {
    'col_1': [3, 2, 1, 0],
    'col_2': ['a', 'b', 'c', 'd'],
}
pd.DataFrame.from_dict(data)

In [None]:
# IPython help syntax: show the documentation for pd.DataFrame.from_dict.
?pd.DataFrame.from_dict

In [None]:
# orient='index' turns every dict key into a row label; the column names
# are then supplied separately.
data = {
    'row_1': [3, 2, 1, 0],
    'row_2': ['a', 'b', 'c', 'd'],
}
df1 = pd.DataFrame.from_dict(
    data,
    orient='index',
    columns=['c1', 'c2', 'c3', 'c4'],
)
df1

### Give column names

In [None]:
# Cell output: the current contents of `df1`.
df1

In [None]:
# Cell output: the column labels of `df1` as a plain Python list.
df1.columns.tolist()

In [None]:
# Assigning to .columns relabels all columns of df1 in place; the list
# supplies one new label per column.
df1.columns = ['A', 'B', 'C', 'D']
df1

### Create dataframe from a list of dictionaries

In [None]:
# IPython help syntax: show the documentation for pd.DataFrame.
?pd.DataFrame

In [None]:
# One dict per row. Only the second record has a 'c' key, so the first
# row's 'c' value is NaN.
data = [
    {'a': 1, 'b': 2},
    {'a': 5, 'b': 10, 'c': 20},
]
df1 = pd.DataFrame(data, index=['first', 'second'])

print(df1)

In [None]:
# Restricting `columns` to 'a' and 'b' leaves the 'c' key out of the frame.
data = [
    {'a': 1, 'b': 2},
    {'a': 5, 'b': 10, 'c': 20},
]
df1 = pd.DataFrame(
    data,
    index=['first', 'second'],
    columns=['a', 'b'],
)

print(df1)

In [None]:
# Ask for 'c' and 'd' as well: 'c' exists only in the second record, and 'd'
# exists in neither, so that column is entirely NaN.
data = [
    {'a': 1, 'b': 2},
    {'a': 5, 'b': 10, 'c': 20},
]
df1 = pd.DataFrame(
    data,
    index=['first', 'second'],
    columns=['a', 'b', 'c', 'd'],
)

print(df1)

### Selecting columns

In [None]:
data = [
    ['Alex', 10, 2009],
    ['Bob', 12, 2011],
    ['Clarke', 13, 2012],
]
df = pd.DataFrame(data, columns=['Name', 'Age', 'Year'])

print(df, '\n', sep='\n')
# Single brackets with one label select that column; its type is printed next.
print(df['Age'])
print(type(df['Age']))

In [None]:
# Indexing with a *list* of labels selects those columns, in the order listed.
print(df, df[['Age']], df[['Age', 'Name']], sep='\n')

### Selecting rows and columns

In [None]:
# Cell output: the current contents of `df`.
df

In [None]:
# Pick one cell with a single .loc[row_label, column_label] lookup. The
# chained df['Name'][1] form first builds the whole column and then indexes
# into it, which is the pattern pandas advises against.
df.loc[1, 'Name']

### .loc (label-based), .iloc (integer-position-based) and .ix (a mix of the two; deprecated and removed in pandas 1.0)

In [None]:
df = pd.DataFrame(
    np.random.randn(8, 4),
    index=['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'],
    columns=['A', 'B', 'C', 'D'],
)

# ':' keeps every row; the list picks columns A and B by label.
print(df, '\n', sep='\n')
print(df.loc[:, ['A', 'B']], '\n', sep='\n')

In [None]:
df = pd.DataFrame(
    np.random.randn(8, 4),
    index=['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'],
    columns=['A', 'B', 'C', 'D'],
)

# Row labels 'a' and 'b', restricted to columns A and B.
print(df, '\n', sep='\n')
print(df.loc[['a', 'b'], ['A', 'B']], '\n', sep='\n')



In [None]:
df = pd.DataFrame(
    np.random.randn(8, 4),
    index=['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'],
    columns=['A', 'B', 'C', 'D'],
)

# Select row 'a' and compare each of its values with zero.
print(df, '\n', sep='\n')
print(df.loc['a'] > 0, '\n', sep='\n')

In [None]:
# Overwrite the single cell at row 'a', column 'B', then show the frame.
df.loc['a', 'B'] = 10000
df

In [None]:
# Set row 'a', column 'B' to 10000; assigning the same constant again
# leaves the frame unchanged.
df.loc['a', 'B'] = 10000
df

In [None]:
# Assigning a scalar to a column sets every row of column 'A' to that value.
df['A'] = 10000
df

In [None]:
df = pd.DataFrame(
    np.random.randn(8, 4),
    index=['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'],
    columns=['A', 'B', 'C', 'D'],
)

# .iloc addresses by integer position: rows 0-1 and columns 0-1.
print(df, '\n', sep='\n')
print(df.iloc[[0, 1], [0, 1]], '\n', sep='\n')

In [None]:
df = pd.DataFrame(
    np.random.randn(8, 4),
    index=['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'],
    columns=['A', 'B', 'C', 'D'],
)

# Positional row slice 0:3 (rows 0, 1, 2) with the first two columns.
print(df, '\n', sep='\n')
print(df.iloc[0:3, [0, 1]], '\n', sep='\n')

In [None]:
df = pd.DataFrame(
    np.random.randn(8, 4),
    index=['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'],
    columns=['A', 'B', 'C', 'D'],
)


print(df)
print('\n')
# .ix was removed in pandas 1.0 and now raises AttributeError; .loc is the
# label-based replacement for selecting every row of column 'A'.
print(df.loc[:, 'A'])

### Filtering by a column condition

In [None]:
# Small three-row frame used by the filtering cells.
data = [
    ['Alex', 10, 2009],
    ['Bob', 12, 2011],
    ['Clarke', 13, 2012],
]
df = pd.DataFrame(data, columns=['Name', 'Age', 'Year'])
df

In [None]:
print(df)
# Boolean mask: keep only the rows whose Age equals 10.
df[df['Age'] == 10]

In [None]:
print(df)
# Two element-wise conditions joined with &; each side is parenthesised.
df[(df['Age'] > 10) & (df['Year'] > 2000)]

### Adding a column

In [None]:
# Fresh three-column frame for the column-adding cells.
data = [
    ['Alex', 10, 2009],
    ['Bob', 12, 2011],
    ['Clarke', 13, 2012],
]
df = pd.DataFrame(data, columns=['Name', 'Age', 'Year'])

df

In [None]:
# Two ways to add a column. First from a Series...
df['Gender'] = pd.Series(['male', 'male', 'male'])
# ...then from a scalar, which overwrites the column assigned just above.
df['Gender'] = 'male'
print(df)

In [None]:
# Replace the 'Gender' column with the constant 1, then show the frame.
df['Gender'] = 1
df

In [None]:
# Cell output: len() of the DataFrame, i.e. its number of rows.
len(df)

### Column deletion

In [None]:
data = [
    ['Alex', 10, 2009],
    ['Bob', 12, 2011],
    ['Clarke', 13, 2012],
]
df = pd.DataFrame(data, columns=['Name', 'Age', 'Year'])

print(df, '\n', sep='\n')

# `del` removes the 'Year' column from df itself.
del df['Year']
print(df)

### Slice rows

In [None]:
data = [
    ['Alex', 10],
    ['Bob', 12],
    ['Clarke', 13],
    ['Jane', 16],
    ['Anna', 10],
]
df = pd.DataFrame(data, columns=['Name', 'Age'], index=[2, 5, 1, 0, 3])

print(df, '\n', sep='\n')
# The slice is positional (rows at positions 2 and 3), not a lookup of the
# index labels 2..4; .iloc spells that out.
print(df.iloc[2:4])

## Some series and dataframe functions

In [None]:
# Cell output: an array of 4 random samples from np.random.randn.
np.random.randn(4)

In [None]:
s = pd.Series(np.random.randn(4))
print(s, '\n', sep='\n')

# .axes is indexed with [0] to reach the Series' row labels; they are
# printed as a list and then again via .index.
print("The axes are:")
print(s.axes)
print(list(s.axes[0]))
print(s.index)

In [None]:
s = pd.Series(np.random.randn(4))
print(s, '\n', sep='\n')

# .dtype reports the element type of the Series.
print("The data type is:")
print(s.dtype)

In [None]:
# Five-row Name/Age frame for the attribute demos.
data = [
    ['Alex', 10],
    ['Bob', 12],
    ['Clarke', 13],
    ['Jane', 16],
    ['Anna', 10],
]
df = pd.DataFrame(data, columns=['Name', 'Age'])
df

In [None]:
# .dtypes lists the data type of each column of df.
print(df.dtypes)

In [None]:
print(s, '\n', sep='\n')
# Print the Series' .empty attribute under the question it answers.
print("Is the Object empty?")
print(s.empty)

In [None]:
print(s, '\n', sep='\n')
# .ndim is the number of dimensions of the object.
print("The dimensions of the object:")
print(s.ndim)

In [None]:
# Rebuild the five-row Name/Age frame before inspecting its dimensions.
data = [
    ['Alex', 10],
    ['Bob', 12],
    ['Clarke', 13],
    ['Jane', 16],
    ['Anna', 10],
]
df = pd.DataFrame(data, columns=['Name', 'Age'])
df

In [None]:
# Print the number of dimensions of df.
print(df.ndim)

In [None]:
data = [
    ['Alex', 10],
    ['Bob', 12],
    ['Clarke', 13],
]
df = pd.DataFrame(data, columns=['Name', 'Age'])
print("Our object is:")
print(df, '\n', sep='\n')

# .shape is a (rows, columns) tuple.
print("The shape of the object is:")
print(df.shape)

In [None]:
print("Our object is:")
print(df)

# head(n) / tail(n) return the first / last n rows.
print('\n')
print('The first 2 rows of the dataframe:')
print(df.head(n=2))

print('\n')
print('The last 2 rows of the dataframe:')
print(df.tail(n=2))

In [None]:
# IPython help syntax: show the documentation for df.head.
?df.head

### Count the number of values in a column

In [None]:
# 'Clarke' appears twice so that the counting cell has a repeated value.
data = [
    ['Alex', 10],
    ['Bob', 12],
    ['Clarke', 13],
    ['Clarke', 18],
]
df = pd.DataFrame(data, columns=['Name', 'Age'])
df

In [None]:
# Count how often each distinct name occurs; print the result's type and
# then the counts themselves.
x = df['Name'].value_counts()
print(type(x), x, sep='\n')

In [None]:
# reindex() is called with no new labels and its result is not stored, so
# `df` itself stays as it was.
# NOTE(review): this looks like a leftover cell -- confirm the intent.
df.reindex()

# Missing values

In [None]:
df = pd.DataFrame(
    np.random.randn(5, 3),
    index=['a', 'c', 'e', 'f', 'h'],
    columns=['one', 'two', 'three'],
)

# Reindexing onto a..h adds rows 'b', 'd' and 'g'; they have no data, so
# they are filled with NaN.
df = df.reindex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'])

print(df)

In [None]:
# Cell output: a boolean Series that is True where column 'one' is missing.
df['one'].isnull()

In [None]:
# Cell output: a boolean Series that is True where column 'one' has a value.
df['one'].notnull()

# Replacing the missing data

In [None]:
# Cell output: `df` as it stands before any missing values are replaced.
df

In [None]:
# Back-fill: each missing value takes the next valid value below it in the
# same column. DataFrame.bfill() replaces fillna(method='bfill'), whose
# `method` argument is deprecated in recent pandas releases.
df1 = df.bfill()
df1

### See the other options that fillna() provides in the pandas documentation :)

In [None]:
# IPython help syntax: show the documentation for df.fillna.
?df.fillna

### Dropping the missing data

In [None]:
# dropna() returns a new frame without the rows that contain missing values;
# the result is stored in df2 and `df` is not reassigned.
df2 = df.dropna()
df2

### Replacing regular values

In [None]:
df = pd.DataFrame({
    'one': [10, 20, 30, 40, 50, 2000],
    'two': [1000, 0, 30, 40, 50, 60],
})

print(df, '\n', sep='\n')
# The dict maps old value -> new value and is applied across all columns.
df1 = df.replace({1000: 10, 2000: 60})
df1

# Getting unique values in a column

In [None]:
import pandas as pd

# 'Clarke' and the age 10 each occur twice, giving the uniqueness cells
# something to collapse.
data = [
    ['Alex', 10],
    ['Bob', 12],
    ['Clarke', 10],
    ['Clarke', 18],
]
df = pd.DataFrame(data, columns=['Name', 'Age'])
df

In [None]:
# Cell output: the 'Name' column of df.
df['Name']

In [None]:
# Cell output: the distinct values found in the 'Name' column.
df['Name'].unique()

In [None]:
# Cell output: how many distinct values the 'Name' column contains.
df['Name'].nunique()

# Loading data from a file into a dataframe

In [None]:
# Load the file with read_csv's default settings.
# NOTE(review): a later cell reads the same file with sep='|', so this call
# presumably leaves each line as one unsplit column -- confirm.
s = pd.read_csv('Practical/data/gender.txt')
s

In [None]:
# IPython help syntax: show the documentation for pd.read_csv.
?pd.read_csv

In [None]:
# Fields in this file are separated by '|'; the user_id column is used as
# the row index.
s = pd.read_csv(
    'Practical/data/gender.txt',
    sep='|',
    index_col='user_id',
)
s