**Pandas**

# Series

In [None]:
import pandas as pd 
import numpy as np

**1- Create Series**

In [None]:
# Create an empty Series. The dtype is passed explicitly because the default
# dtype of an empty Series differs between pandas versions (and pandas 1.x
# emits a DeprecationWarning when it is omitted).
s = pd.Series(dtype="float64")

In [None]:
s

**1.1** Create Series from ndarray

In [None]:
# Build two 10-element ndarrays: the integers 0..9 and the letters 'a'..'j'.
intArr = np.arange(0, 10)
sArr = np.array(list("abcdefghij"))

# No index is passed, so pandas assigns the default integer index
# 0 .. len(data) - 1 to each Series.
intSer = pd.Series(intArr)
sSer = pd.Series(sArr)

In [None]:
intSer

In [None]:
sSer

In [None]:
# Create Series by index
ser = pd.Series(data=intArr, index=['a', 'b','c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'])
print(ser)

**1.2** Create Series from dictionary

In [None]:
# Create Series using dictionary

dictData = {'a' : 0., 'b' : 1., 'c' : 2.}

s = pd.Series(data = dictData)
print(s)

In [None]:
# Create Series using dictionary and passing index.
# When an index is passed together with a dict, values are looked up by label:
# none of 0, 1, 2 is a key of dictData, so every entry of s comes out as NaN.
dictData = {'a' : 0., 'b' : 1., 'c' : 2.}
s = pd.Series(data = dictData, index=[0,1,2])
print(s)

In [None]:
dictData = {'a' : 0., 'b' : 1., 'c' : 2.}
s = pd.Series(dictData.values(), index=[0,1,2])
print(s)

In [None]:
dictData = {'a' : 0., 'b' : 1., 'c' : 2., 'd': 2}
s = pd.Series(dictData.keys(), index=dictData.values())
print(s)

In [None]:
s[2.0]

In [None]:
dictData = {'a' : 0., 'b' : 1., 'c' : 2., 'd': 2}
s = pd.Series(dictData.keys(), index=[0,1,2,2])
print(s)

**1.3** Create Series from list

In [None]:
listData = [34, 21, "40", 7.9]

ser = pd.Series(data=listData)

print(ser)

In [None]:
# Name the Series
ser = pd.Series(data = listData, name="SeriesFromList")

In [None]:
ser.name

In [None]:
# rename Series
ser.name = 'MahmoudSeries'

print(ser)

In [None]:
# Get Series index 
ser.index

In [None]:
list(ser.index)

In [None]:
intArr = np.arange(10)
ser = pd.Series(data=intArr, index=['a', 'b','c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'])
print(ser.index)

In [None]:
# Access to values in Series as array
ser.values

In [None]:
# You can access the underlying data array using .to_numpy():
ser.to_numpy()


**2- Indexing and Slicing Series**

**2.1** Integer Indexing

In [None]:
ser.index

In [None]:
# Access the first element of the Series, once by label and once by position.
# `ser[0]` on a string-labelled index relies on the positional fallback of
# `[]`, which is deprecated in pandas 2.1+; `.iloc[0]` is the explicit,
# version-independent way to index by position.

print("By 'a' index : ", ser["a"])
print("By 0 index : ", ser.iloc[0])

In [None]:
ser[0:6]

In [None]:
# Slicing data from 0 to 6 by step 2
ser[0:6:2]

In [None]:
# Slicing data using characters 
ser['a':'i']

In [None]:
ser['a':'i' :3 ]

In [None]:
ser = pd.Series(data = [10, 20,30], index=[1, 5 , 68])

In [None]:
ser

In [None]:
ser[1]

In [None]:
# With an integer index, `[]` looks up by label, not by position. There is no
# label 2 in the index [1, 5, 68], so this lookup raises KeyError; it is caught
# and shown here so the notebook still runs from top to bottom.
try:
    ser[2]
except KeyError as err:
    print("KeyError:", err)

**2.2** Boolean Indexing

In [None]:
# Return a boolean Series: True where the value is greater than 5
ser > 5 

In [None]:
# Return value from Series where have True value
ser[ ser > 5 ]

Statistics in Pandas 

In [None]:
# Get the mean value of the Series
print("Mean value : ", ser.mean())

In [None]:
# Get the standard deviation of the Series
print("Standard Deviation : ", ser.std())

In [None]:
# Print the quantiles of the Series at 0.25, 0.50 and 0.75.
print("Quantiles values : ")
for label, q in (("Q1 : ", 0.25), ("Q2 : ", 0.50), ("Q3 : ", 0.75)):
    print(label, ser.quantile(q))

Check if value in Series index

In [None]:
ser

In [None]:
'a' in ser

In [None]:
'z' in ser

In [None]:
1 in ser.index

# Data-Frames

**1- Create Data-Frame**

In [115]:
import pandas as pd 

In [116]:
# Create empty Data-Frame
df = pd.DataFrame()

In [117]:
df

**1.1**- Create Data-Frame from list 

In [118]:
# Create a Data-Frame from a 2D list: each inner list is one row.
data_1 = [
    ['Mahmoud', 98.0],
    ['Ahmad', 79.7],
    ['Rand', 80.9],
]

df = pd.DataFrame(data_1, columns=["name", "gpa"])

In [119]:
df

Unnamed: 0,name,gpa
0,Mahmoud,98.0
1,Ahmad,79.7
2,Rand,80.9


**2.1**- Create Data-Frame from dictionary

In [120]:
data = {'Name':['Mahmoud', 'Ahmad', 'Rand', 'Rami'],
        'Age':[28,34,29,42]}

In [121]:
df = pd.DataFrame(data)

In [122]:
df

Unnamed: 0,Name,Age
0,Mahmoud,28
1,Ahmad,34
2,Rand,29
3,Rami,42


Create a DataFrame from List of Dicts

In [123]:
data = [{'a': 5, 'b': 10, 'c': 20}, {'a': 1, 'b': 2}, {'a': 50, 'b': 50, 'c': 50}]
df = pd.DataFrame(data)

In [124]:
df

Unnamed: 0,a,b,c
0,5,10,20.0
1,1,2,
2,50,50,50.0


In [125]:
# Create Data-Frame from list of dictionaries where keys are different from dictionary to another
data = [{'a': 5, 'b': 10, 'c': 20}, {'a': 1, 'b': 2}, {'a': 50, 'b': 50, 'c': 50, 'd':0}]
df = pd.DataFrame(data)
print(df)

    a   b     c    d
0   5  10  20.0  NaN
1   1   2   NaN  NaN
2  50  50  50.0  0.0


Create a DataFrame from Series

In [126]:
# Create Data-Frame from Series with index 

d = {'one' : pd.Series([1, 2, 3], index=['a', 'b', 'c']),
   'two' : pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])}

df = pd.DataFrame(data = d)

In [127]:
df

Unnamed: 0,one,two
a,1.0,1
b,2.0,2
c,3.0,3
d,,4


In [128]:
# Create Data-Frame from Series with index 

d = {'one' : pd.Series([1, 2, 3], index=['a', 'b', 'c']),
   'two' : pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd']),
   'three' : pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'f'])}

df = pd.DataFrame(data = d)

In [129]:
df

Unnamed: 0,one,two,three
a,1.0,1.0,1.0
b,2.0,2.0,2.0
c,3.0,3.0,3.0
d,,4.0,
f,,,4.0


In [130]:
d = {'one' : pd.Series([1, 2, 3], index=['a', 'b', 'c']),
   'two' : pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd']),
   'three' : pd.Series([1, 2, 3, 4])}

df = pd.DataFrame(data = d)
df

Unnamed: 0,one,two,three
a,1.0,1.0,
b,2.0,2.0,
c,3.0,3.0,
d,,4.0,
0,,,1.0
1,,,2.0
2,,,3.0
3,,,4.0


In [131]:
d = {'one' : pd.Series([1, 2, 3], index=['a', 'b', 'c']),
   'two' : pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])}

df = pd.DataFrame(data = d)
df

Unnamed: 0,one,two
a,1.0,1
b,2.0,2
c,3.0,3
d,,4


In [None]:
import numpy as np

In [132]:
# NaN is an ordinary Python float. `np.nan` is used because the `np.NaN`
# alias was removed in NumPy 2.0.
type(np.nan)

float

Add Column into Data-Frame

In [None]:
# Add column by Series
df["three"] = pd.Series([10,20,30],index=['a','b','c'])

In [None]:
# Add column by constant
df["four"] = 4

In [None]:
df

Delete Column from Data-Frame

In [None]:
# Delete the column "four" from df with the `del` statement
del df["four"]

print(df)

In [None]:
# using pop
d = df.pop("two")

In [None]:
print(d)
print()
type(d)

In [None]:
df 

Create Data-Frame from csv file

In [None]:
# NOTE(review): absolute path (presumably Colab's bundled sample data), so this
# only resolves in an environment that has /content/sample_data. The frame
# loaded here is replaced by the california_housing_train.csv load further down.
df = pd.read_csv("/content/sample_data/mnist_train_small.csv")

In [None]:
df

In [None]:
df = pd.read_csv("/content/sample_data/california_housing_train.csv")

In [None]:
df

In [None]:
# Return first n of rows from Data-Frame
# head() 
# By default return first 5 rows

df.head()

In [None]:
# Return first 3 rows
df.head(3)

In [None]:
df.head(9)

In [None]:
# Return the last n rows of the Data-Frame
# tail()
# By default returns the last 5 rows

df.tail()

In [None]:
# Return the last row in Data-Frame
df.tail(1)

In [None]:
df.tail(6)

In [None]:
df.info()

In [None]:
# Get shape of Data-Frame
df.shape

Select Rows by label/column name

loc : location

In [None]:
# Get columns names from Data-Frame
df.columns

list(df.columns)

In [None]:
# Select from one column
df.loc[ : , 'longitude']

In [None]:
df.loc[5:15 , 'longitude']

In [None]:
ser = df.loc[10:20:2 , 'longitude']

print(ser)

In [None]:
ser[10]

In [None]:
# Select from multiple columns
df.loc[ :10 , ['total_rooms','total_bedrooms', 'population'] ]

iloc : index location

In [None]:
# Select one column by column index
df.iloc[ :10 , 2 ]

In [None]:
# Select multiple columns by columns indecies
df.iloc[:10 , [ 2 , 5 , 7] ]

In [None]:
# Select multiple columns by slicing columns indecies
df.iloc[:10 , 2:8]

In [None]:
# Select multiple columns by slicing columns indecies with steps
df.iloc[:10:2 , 2:8:2]

In [None]:
df

In [None]:
df.index

Describe data in Data-Frame

In [None]:
# Describe all numaric data in Data-Frame
df.describe()

In [None]:
desc = df.describe()

In [None]:
# Get the type of desc
type(desc)

In [None]:
# Get columns
desc.columns

In [None]:
# Get indecies in desc Data-Frame
desc.index

In [None]:
# Get shape of data
desc.shape

Slicing data from Data-Frame

In [None]:
desc["longitude"]

In [None]:
desc['max' : ]

In [None]:
desc['std' : '75%']

In [None]:
desc[1:4]

In [None]:
desc[:8:2]

In [None]:
desc[:8:2][:3]

In [None]:
desc[:8:2][:3]["std" :]["median_house_value"]

In [None]:
df

In [None]:
# Return the mean of median_house_value
print(df["median_house_value"].mean())
print(df.loc[: , ["median_house_value"]].mean())

In [None]:
# Return the std of last column  using column index
df.iloc[:, -1 ].std()

In [None]:
df.shape

In [None]:
df_shape = df.shape

print(df_shape)

In [None]:
df_shape[-1]

In [None]:
# Create ndarray with the same shape of Data-Frame

# Solution 1
arr = np.empty(df_shape)
print(arr.shape)


# Solution 2

arr = np.zeros(df_shape)
print(arr.shape)

# Solution 3 
arr = np.array([1])
arr = np.resize(arr, df_shape)
print(arr.shape)


In [None]:
df

In [None]:
df.shape[0]

In [None]:
# Print median_house_value for every row where total_rooms == 2000.
# A boolean mask selects the matching rows in one vectorized step instead of
# looking each row up with df.loc[i, ...], which is slow and only works when
# the index labels are exactly 0 .. n-1.
rooms_mask = df["total_rooms"] == 2000
for house_value in df.loc[rooms_mask, "median_house_value"]:
    print(house_value)


In [None]:
df.columns

In [None]:
# Count the rows whose housing_median_age is at least 37.
# Summing the boolean mask counts its True entries, replacing the per-row
# df.loc[i, ...] loop (which also assumed index labels 0 .. n-1).
counter = int((df["housing_median_age"] >= 37).sum())

print(counter)

In [None]:
df.loc[0 ,'total_rooms']

In [None]:
df

In [None]:
list( df.iterrows())[:2]

In [None]:
# Walk the Data-Frame row by row with iterrows() and print median_house_value
# for each row whose total_rooms equals 2000.
for _, record in df.iterrows():
    if record["total_rooms"] != 2000:
        continue
    print(record["median_house_value"])

In [None]:
# Return Data-Frame where median_house_value greater than the mean of median_house_value

In [None]:
df.loc[:, "median_house_value"].mean()

In [None]:
df.loc[:, "median_house_value"] > df.loc[:, "median_house_value"].mean()

In [None]:
df[ df.loc[:, "median_house_value"] > df.loc[:, "median_house_value"].mean()  ]

In [None]:
df[df.loc[:, "median_house_value"] > df.loc[:, "median_house_value"].mean()]["population"]

In [None]:
df[df.loc[:, "median_house_value"] > df.loc[:, "median_house_value"].mean()].shape[0]

In [None]:
df[df.loc[:, "median_house_value"] <= df.loc[:, "median_house_value"].mean()].shape[0]

In [None]:
# NOTE(review): hard-coded numbers — presumably a row count from one of the two
# cells above divided by the total number of rows; confirm against df.shape.
6891 / 17000

Data-Frame size : Number of elements in the Data-Frame

In [None]:
print(df.size)
print(df.shape[0] * df.shape[1]) # n rows * n columns

Data-Frame data types : return data type for each column in Data-Frame

In [None]:
df.dtypes

Transpose Data-Frame

In [None]:
desc

In [None]:
# Transpose desc Data-Frame
dfTransposedDesc = desc.T

In [None]:
dfTransposedDesc

In [None]:
dfTransposedDesc.columns

In [None]:
dfTransposedDesc.index

In [None]:
dfTransposedDesc["mean"]

In [None]:
# df[columns][rows]
dfTransposedDesc["mean"]["longitude"]

In [None]:
dfTransposedDesc["mean"][0:3]

In [None]:
dfTransposedDesc["mean"][["longitude", "population"]]

In [None]:
dfTransposedDesc["min"][["longitude", "population"]]

Values : Returns the actual data in the DataFrame as an NDarray.

In [None]:
df.values

Empty : Returns the Boolean value saying whether the Object is empty or not; True indicates that the object is empty.



In [None]:
df_temp = pd.DataFrame()

In [None]:
df_temp.empty

In [None]:
df.empty

**Handling missing data**

In [None]:
import pandas as pd 
import numpy as np 

Check null values in Data-Frame

In [None]:
data = {'one' : pd.Series([1, 2, 3], index=['a', 'b', 'c']),
        'two' : pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd']),
        'three' :  pd.Series([10,20,30],index=['a','b','c']) }

df_temp = pd.DataFrame(data)
print(df_temp)

In [None]:
type(df_temp["three"]['d'])

In [None]:
df_temp.isnull()

In [None]:
~ df_temp["one"].isnull()

In [None]:
df_temp[~ df_temp["one"].isnull()]

In [None]:
df_temp.notnull()

In [None]:
pd.isnull(df_temp)

Fill NaN

In [None]:
df = pd.DataFrame([[np.nan, 2, np.nan, 0],
                   [3, 4, np.nan, 1],
                   [np.nan, np.nan, np.nan, np.nan],
                   [np.nan, 3, np.nan, 4]],
                  columns=list("ABCD"))

print(df)

In [None]:
# Fill NaN values with 0s.

df.fillna(value = 0)

In [None]:
# Fill all NaN values with with mean value of specific column

df.fillna(value = df["B"].mean() )

In [None]:
df

In [None]:
# Replace all NaN elements in column ‘A’, ‘B’, ‘C’, and ‘D’, with 0, 1, 2, and 3 respectively.
values = {"A": 0, "B": 1, "C": 2, "D": 3}

print(df.fillna(value = values))

In [None]:
values = {"A": df["A"].mean(), "B": df["B"].mean(), "C": df["C"].mean(), "D": df["D"].mean()}
print(df.fillna(value = values))

In [None]:
df_2 = df.copy()
for col in df_2.columns:
  df_2[col] = df_2[col].fillna(value=df_2[col].mean())
print(df_2)

In [None]:
df.columns

In [None]:
# Map every column name to the mean of that column, then use the mapping
# to fill each column's NaN values with its own mean.
dict_values = {col: df[col].mean() for col in df.columns}

print(dict_values)
print(df.fillna(value = dict_values))

In [None]:
# for col in list(df.columns):
#   df[col].fillna(value = df[col].mean(), inplace=True )

# print(df)

In [None]:
df

In [None]:
values = {'A': 3.0, 'B': 3.0, 'C': 5, 'D': 1.7}

In [None]:
# Only replace the first NaN element.

df.fillna(value=values, limit=1)

In [None]:
df

In [None]:
# Only replace the first two NaN element.

df.fillna(value=values, limit=2)

In [None]:
# Fill forward along the rows (axis=0): each NaN takes the last valid value
# above it in the same column. DataFrame.ffill() replaces
# fillna(method='ffill'), which is deprecated since pandas 2.1.

print(df)

print()

print(df.ffill(axis=0))

In [None]:
# Fill forward along the columns (axis=1): each NaN takes the last valid value
# to its left in the same row. DataFrame.ffill() replaces
# fillna(method='ffill'), which is deprecated since pandas 2.1.
print(df)

print()

print(df.ffill(axis=1))

In [None]:
# Fill backward along the rows (axis=0): each NaN takes the next valid value
# below it in the same column. DataFrame.bfill() replaces
# fillna(method='bfill'), which is deprecated since pandas 2.1.
print(df)

print()

print(df.bfill(axis=0))

In [None]:
# Fill backward along the columns (axis=1): each NaN takes the next valid value
# to its right in the same row. DataFrame.bfill() replaces
# fillna(method='bfill'), which is deprecated since pandas 2.1.
print(df)

print()

print(df.bfill(axis=1))

Filling using a DataFrame, replacement happens along the same column names and same indices

In [None]:
df

In [None]:
df2 = pd.DataFrame(np.zeros((4,4 )), columns=list("ABCE"))
print(df2)

In [None]:
df.fillna(df2)

Note that column D is not affected since it is not present in df2.

In [None]:
df = pd.DataFrame({
    'Postal Address': ['New York', np.nan, 'London', 'Mumbai', np.nan],
    'Permanent Address': ['Miami', 'Amsterdam', 'London', 'Rajkot', 'Sydney']
})
print(df)

In [None]:
# Fill missing postal addresses with the permanent address of the same row.
# The result is assigned back to the column: calling fillna(inplace=True) on
# df['Postal Address'] operates on a selected Series and, under pandas
# Copy-on-Write, leaves df itself unchanged.
df['Postal Address'] = df['Postal Address'].fillna(df['Permanent Address'])

In [None]:
df

Drop NaN values from Data-Frame

In [None]:
#   dropna function along with the axis argument

#   By default, axis=0

#   if any value within a row is NA then the whole row is excluded

In [None]:
df = pd.DataFrame(data = np.random.randn(5, 3), index=['a', 'c', 'e', 'f','h'],
                  columns=['one', 'two', 'three'])

print(df)

In [None]:
df = df.reindex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'])

print(df)

In [None]:
df.dropna(axis =0 )

In [None]:
df.dropna(axis =1)

In [None]:
df

Replace data in Data-Frame

In [None]:
df

In [None]:
df = pd.DataFrame({'one':[10,20,30,40,50,2000], 'two':[1000,0,30,40,50,60]})

df

In [None]:
# Replace one value
 
df.replace(1000, 1500 )

In [None]:
# Replace multiple value
df.replace( { 1000:1500 , 0 :2222 } )

In [None]:
df

**Group by**

Split data

In [None]:
# Define a dictionary containing employee data
data = {'Name':['Jai', 'Anuj', 'Jai', 'Princi','Gaurav', 'Anuj', 'Princi', 'Abhi'],
		     'Age':[27, 24, 22, 32,33, 36, 27, 32],
		     'Address':['Nagpur', 'Kanpur', 'Allahabad', 'Kannuaj',
				 'Jaunpur', 'Kanpur', 'Allahabad', 'Aligarh'],
		     'Qualification':['Msc', 'MA', 'MCA', 'Phd','B.Tech', 'B.com', 'Msc', 'MA'],
         'Salary' : [400, 600, 500, 600, 550, 370, 800, 1000]}
	

# Create Data-Frame from dictionary of data
df = pd.DataFrame(data)

print(df)


In [None]:
# In order to split the data, we use groupby() function 
# this function is used to split the data into groups based on some criteria

In [None]:
df.groupby('Name').groups

In [None]:
# group the data on Name value.
groupByName = df.groupby("Name")

In [None]:
# first function return Data-Frame of first entries in all the groups formed
groupByName.first()

In [None]:
# last function return Data-Frame of the last entries in all the groups formed
groupByName.last()

In [None]:
# print each group name with data
for name, group in groupByName:
  print(name)
  print(group)

In [None]:
groupByQualification = df.groupby("Qualification")

In [None]:
for name, group in groupByQualification:
  print(name)
  print(group)

In [None]:
# get_group returns the Data-Frame of rows that belong to the given group name
groupByQualification.get_group('MA')

Grouping data with multiple keys

In [None]:
group = df.groupby(['Name', 'Qualification'])

In [None]:
group.groups

In [None]:
# Return the Data-Frame for the ('Princi', 'Phd') group; with multiple grouping
# keys the group name is a tuple with one value per key.
group.get_group(('Princi', 'Phd'))

In [None]:
# grouping on range of Salary

# group = df.groupby(['Salary', np.arange(100, 1000 , 200)])

In [None]:
df.groupby(['Qualification']).sum()

Applying function to a group

Aggregation is a process in which we compute a summary statistic about each group

In [None]:
# 1- Aggregation 
#  Aggregated function returns a single aggregated value for each group

In [None]:
# performing aggregation using aggregate method
group_1 = df.groupby('Name')

In [None]:
# Sum every column within each Name group. The aggregation is named by the
# string "sum": when np.sum is passed, pandas substitutes its own group-wise
# sum anyway, and newer pandas versions emit a FutureWarning about it.
group_1.aggregate("sum")

In [None]:
# Sum every remaining column within each (Name, Qualification) group. The
# string name "sum" is used instead of np.sum, which newer pandas versions
# warn about when passed to a groupby aggregation.
df.groupby(['Name', 'Qualification']).aggregate("sum")

In [None]:
# Apply multiple functions by passing a list of aggregation names.
# Only the numeric columns are selected: mean and std are undefined for the
# string columns, and pandas 2.x raises TypeError instead of dropping them.

df.groupby(['Name'])[['Age', 'Salary']].aggregate(['sum', 'mean', 'std'])

In [None]:
# Sum, mean and standard deviation of the numeric columns per Qualification.
# The string columns are excluded explicitly because mean/std on them raises
# TypeError in pandas 2.x.
df.groupby(['Qualification'])[['Age', 'Salary']].aggregate(['sum', 'mean', 'std'])

In [None]:
# Apply a different aggregation to each column: a dict maps the column name
# to the aggregation to run on it. String names are used instead of the NumPy
# callables, which newer pandas versions warn about in groupby aggregations.
df.groupby(['Qualification']).aggregate({'Age' : 'sum', 'Salary' : 'mean'})

Function Application

pipe : Table wise Function Application

apply : Row or Column Wise Function Application

applymap : Element wise Function Application (in pandas 2.1+ `DataFrame.applymap` is deprecated in favour of `DataFrame.map`)

In [None]:
df = pd.DataFrame(np.random.randn(5,3),columns=['col1','col2','col3'])
print(df)

In [None]:
def adder(num1, num2):
    """Return the result of ``num1 + num2``."""
    total = num1 + num2
    return total


def addFive(num):
    """Return ``num`` increased by five."""
    increment = 5
    return num + increment

In [None]:
df

In [None]:
# pipe
df.pipe(adder, 2)

In [None]:
df + 2

Apply

In [None]:
df["col4"] = df["col3"].apply(addFive)
print(df)

In [None]:
data = {'Team': ['Riders', 'Riders', 'Devils', 'Devils', 'Kings',
        'kings', 'Kings', 'Kings', 'Riders', 'Royals', 'Royals', 'Riders'],
        'Rank': [1, 2, 2, 3, 3,4 ,1 ,1,2 , 4,1,2],
        'Year': [2014,2015,2014,2015,2014,2015,2016,2017,2016,2014,2015,2017],
        'Points':[876,789,863,673,741,812,756,788,694,701,804,690]}

df = pd.DataFrame(data)

print(df)

In [None]:
dict_teams_nums = {"Riders" : 0, 
          "Devils" : 1,
          "kings"  : 2,
          "Kings"  : 3,
          "Royals" : 4}

In [None]:
dict_teams_nums["Royals"]

In [None]:
{"Riders" : 0, 
          "Devils" : 1,
          "kings"  : 2,
          "Kings"  : 3,
          "Royals" : 4}["Royals"]

In [None]:
def getTeamNumer(team):
    """Return the integer code assigned to a team name.

    The lookup is case-sensitive: "kings" and "Kings" map to different codes.
    Raises KeyError when ``team`` is not one of the five names listed below.
    """
    team_codes = {
        "Riders": 0,
        "Devils": 1,
        "kings": 2,
        "Kings": 3,
        "Royals": 4,
    }
    return team_codes[team]


In [None]:
df

In [None]:
df['Team'].apply(getTeamNumer)

In [None]:
df["intTeam"] = df['Team'].apply(getTeamNumer)

In [None]:
df

In [None]:
df.Team.value_counts()