# Pandas: Series and DataFrame

In [None]:
import pandas as pd
import numpy as np


# Series


In [None]:
# Creating Series
# Seed NumPy's global generator so the "random" marks are identical on every
# Restart & Run All; later np.random calls in the notebook continue from this seeded stream.
np.random.seed(42)
lis = [np.random.randint(0, 100) for _ in range(6)]  # list comprehension: six marks in [0, 100)
lis

In [None]:

#Pass the list to the Series constructor
pd.Series(lis)

In [None]:
#Creating Series from Numpy Arrays

lis = np.array(lis)
print("Array:",lis)
print("Series:\n", pd.Series(lis))

In [None]:
label = ['James','John','Jane','Maria','Sam','Jean']

In [None]:
s1 = pd.Series(lis,label,name="Maths")

In [None]:
print(s1)

In [None]:
#Creating Series from Dictionaries
marks =  {'James':90,'John':87,'Jane':76,'Maria':90,'Sam':94,'Jean':76}

In [None]:
marks = pd.Series(marks)
marks

# Retrieving Data from Series

Using index

In [None]:
marks['James']

In [None]:
s1[0]

#You can change or modify the data values of a Series.

In [None]:
s1['Jane'] = 89

In [None]:
s1

In [None]:
s1[[0,1]]

In [None]:
#Using loc: state explicitly that the lookup is by index label

In [None]:
s1.loc["James"]  # single label -> the scalar mark stored for James


In [None]:
s1.loc[["James","Jane"]]  # list of labels -> a Series holding just those two entries


In [None]:
#Using iloc: state explicitly that the lookup is by integer position

In [None]:
s1.iloc[4]  # fifth entry (positions start at 0)

In [None]:
s1.iloc[[2,3]]  # third and fourth entries, returned as a Series

In [None]:
#Deleting values from a Series

In [None]:
# drop() returns a new Series without "Jane" and leaves s1 itself untouched (see the next cell).
# To actually remove the entry, reassign the result (s1 = s1.drop("Jane")) or pass inplace=True.
s1.drop("Jane")

In [None]:
s1  # still contains Jane: the result of the drop above was not stored

In [None]:
#Arithmetic operations on a pandas Series

In [None]:
s2 = s1 + 5  # scalar arithmetic is applied to every element; s1 itself is not modified
s2

In [None]:
#What is the average score?
s1.mean()

In [None]:
s1.idxmin()  # index label (student name) of the lowest score

In [None]:
s1.idxmax()  # index label (student name) of the highest score

In [None]:
np.max(s1)  # highest score (the value, not the label)

In [None]:

s1["James"] + 20  # displays James's mark plus 20; nothing is assigned back to s1

In [None]:
s1  # unchanged by the expression above

In [None]:
#Exercise: what is the lowest score?


# DataFrame
A DataFrame is a two-dimensional labelled object with rows and columns; it can hold and process data of mixed types.

In [None]:
#Let's create a DataFrame containing scores of 6 students in  6 subjects

In [None]:
score = [ np.random.randint(0,101) for x in range(0,36)]

In [None]:
scores = np.array(score)

In [None]:
scores

In [None]:
student_score = scores.reshape(6,6)

In [None]:
student_score

In [None]:
students = ['James','John','Jane','Maria','Sam','Jean']
subjects = ['Maths','History','Physics','Chemistry','IT','Biology']

In [None]:
df = pd.DataFrame(data= student_score, index=students,columns=subjects)

In [None]:
df

In [None]:
df.iloc[[0]]

In [None]:
# Merging DataFrames
# The marks are kept in descriptively named lists so that this cell does not rebind
# s1/s2, which hold the Series built earlier in the notebook.
maths_marks = [90, 87, 78, 95, 76]
physics_marks = [56, 67, 30, 32, 31]
names = ['James', 'John', 'Jane', 'Maria', 'Samuel']
# Both frames share the same index (the student names).
df1 = pd.DataFrame({"Maths": maths_marks}, index=names)
df2 = pd.DataFrame({"Physics": physics_marks}, index=names)

In [None]:
df1

In [None]:
df2

In [None]:
df3 = pd.merge(df1,df2,  right_index=True, left_index=True)

In [None]:
df3

In [None]:
import matplotlib.pyplot as plt
display(df.plot(kind='bar'))
#plt.savefig('graph.png')

In [None]:
display(df.loc['James'].plot(kind = 'bar', y='Marks'))


In [None]:
#Let's retrieve the index of our DataFrame; it contains the students' names

In [None]:
df.index  # the row labels, returned as a pandas Index object

In [None]:
#Let's retrieve the columns of our DataFrame; they contain the subjects

In [None]:
df.columns  # the column labels, also returned as a pandas Index object

In [None]:
df.values  # the underlying data as a 2-D NumPy array (labels are dropped)

In [None]:
df.describe()  # per-column descriptive statistics (count, mean, std, min, quartiles, max)

# Retrieving Data from DataFrame

In [None]:
#Let's retrieve all scores for Biology

x = df["Biology"]  # selecting a single column with [] returns a Series
print(type(x))

In [None]:
#For multiple columns, pass a list of column names; the result is a DataFrame
df[["Biology","Chemistry"]]

In [None]:
#What did James score in Maths? The first [] selects the column (not the row label);
#the second [] then looks up James in the resulting Series.
#NOTE(review): df.loc['James', 'Maths'] performs the same lookup in a single step.
df['Maths']['James']

In [None]:
#Retrieving a row. What did John score in all subjects?

In [None]:
df.loc[["John"]]  # a list of labels returns a one-row DataFrame

In [None]:
df.loc[["John"]]  # NOTE(review): identical to the previous cell; possibly meant to show df.loc["John"], which returns a Series

# Adding Columns to DataFrame

In [None]:
#Adding new Columns. Let's add the scores of all students in Geography!
df["Geography"] = [56,78,86,34,45,70]

In [None]:
df

In [None]:
#Adding new rows. Let's add the scores for Patrick.
Patrick = {"Maths":78,"History":67,"Physics":90,"Chemistry":76,"IT":98,"Geography":78}


In [None]:
patrick_df = pd.DataFrame(Patrick, index=["Patrick"])

In [None]:
patrick_df

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to stack rows.
# sort=False keeps df's column order. Patrick has no Biology mark, so that cell becomes NaN.
df_1 = pd.concat([df, patrick_df], sort=False)
df_1


In [None]:
#df.fillna(df['Biology'].mean())

In [None]:
# Fill in Patrick's missing Biology mark with a single .loc[row, column] assignment, which
# writes into df_1 itself. The chained form df_1['Biology']['Patrick'] = 89 assigns through
# an intermediate Series: it raises SettingWithCopyWarning and is not guaranteed to update df_1.
df_1.loc['Patrick', 'Biology'] = 89
df_1

In [None]:
#Inserting a column at a specific location. Let's add scores for a new subject (Arts) as the first column (position 0).
#Note: insert() modifies df in place and raises ValueError if "Arts" already exists,
#so this cell can only be run once per kernel session.

df.insert(0,"Arts",[90,67,88,98,67,85])

In [None]:
df

# Deletion
You can delete columns and rows using the `pop` and `drop` methods of a DataFrame.

In [None]:
# Deleting using the pop and drop methods
#pop removes a single column in place; drop can remove columns or rows and returns a new DataFrame


In [None]:
df.pop("Geography")  # removes Geography from df and returns the removed column

In [None]:
df

In [None]:
#Deleting columns using drop: set axis to 1. df itself is not changed.
dropped = df.drop(["Arts", "Biology"], axis=1)
dropped

In [None]:
#Deleting rows using drop: set axis to 0. df itself is not changed.
dropped = df.drop(["James", "John"], axis=0)
dropped

In [None]:
#Changing column labels with rename; the renamed copy is displayed but not stored
df.rename(columns={"IT":"Computing"})

In [None]:
#Changing row labels with rename; the renamed copy is displayed but not stored
df.rename(index={"Sam":"Samuel"})

# Cleaning Data 
How to deal with missing values.

In [None]:
import math  # NOTE(review): math is not used in any cell visible here
NaN = float('nan')
# Add a column with two missing marks (third and fifth student) to practise on.
df["Agriculture"] = [56,78,NaN,34,NaN,70]

#Other ways to introduce missing values (left disabled):
#df["Maths"]["Jean"]=NaN
#df["Biology"]["Jean"]=NaN

In [None]:
df_null = df.isnull()  # boolean frame: True where a value is missing
df_null

In [None]:
df_null["Maths"]

In [None]:
df_null.sum()  # number of NaN values in each column

In [None]:
df.count()  # number of non-null values in each column

In [None]:
#Dealing with NaN values: use dropna to discard them or fillna to replace them.
#As called below, both return a new DataFrame; df keeps its NaN values.

In [None]:
df.dropna(axis=0)  # drops the rows that contain missing values

In [None]:
df.dropna(axis=1)  # drops the columns that contain missing values

In [None]:

#Filling missing values with 0
df.fillna(0)

In [None]:
# forward fill:replaces missing values from previous item in the column or row

df.fillna(method='ffill', axis=0)

In [None]:
df

In [None]:
#propagate non-null values forward
df.fillna(method='ffill', axis=1)

In [None]:
#propagate non-null values backward
df.fillna(method='backfill',axis=0)#Backward fill

In [None]:
df

In [None]:
#Filling missing values by linear interpolation.
#axis=1 interpolates along each row, i.e. across the subject columns rather than down a column.
df.interpolate(method='linear',axis=1)

In [None]:
#Loading data into DataFrames: pandas can read data files directly into a DataFrame.
#This is a real-world dataset of labelled smartphone sensor data.


In [None]:
#Background paper: "User context recognition using smartphone sensors and classification models"
#https://www.sciencedirect.com/science/article/pii/S1084804516300261
#index_col=0 uses the first CSV column as the row index. The path is relative to the working directory.
context = pd.read_csv("mobile_context.csv", index_col=0)

In [None]:
context.head(20)  # first 20 rows

In [None]:
context.tail(10)  # last 10 rows

In [None]:
context.isnull().any()  # per column: True if at least one value is missing

In [None]:
time =  context['time']  # the 'time' column as a Series

In [None]:
time.head(20)

In [None]:
context.describe()

In [None]:
context.columns

In [None]:
context["lux"].describe()  # summary statistics for the 'lux' column only

In [None]:
#Exercise: retrieve specific columns into a new DataFrame.

#Let's load another dataset from a CSV file.

In [None]:
pop = pd.read_csv("state-population.csv")

In [None]:
pop.head()

In [None]:
population = pop.rename(columns={"state/region":"state"})#Changes the column label

In [None]:
population.head()

In [None]:
%matplotlib inline
population.groupby(['year'])['population'].mean().plot()

In [None]:
population.groupby(['state'])['population'].mean()

In [None]:
population.groupby(['state','year'])['population'].mean()

In [None]:
%time
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
wine = pd.read_csv(url,sep=';')

#You can use wine.to_csv() (the DataFrame.to_csv method) to persist the wine data to your local drive.

In [None]:
wine.head()#Displays the first 5 records of the wine DataFrame, you can change the number of records to be displayed.

In [None]:
wine.info()#Displays the total number of entries, columns( with data types) and the memory usage of the dataset

In [None]:
wine.values#

In [None]:
#What is the average alcohol % per vol?

wine['alcohol'].mean()

In [None]:
#Let's group the wine according to quality and visualize the grouping

wine.groupby("quality").plot()

In [None]:
wine.head()

In [None]:
wine.plot(kind="scatter",x="quality",y="alcohol")

In [None]:
#Compute the wine quality and alcohol ratio

In [None]:
wine['qual_alc ratio'] = wine['quality']/wine['alcohol']

In [None]:
wine.head()

In [None]:
#Sort the wine in descending order of quality

In [None]:
wino= wine.sort_values(by='quality', ascending=False)

In [None]:
wino.head(5)