<a href="https://colab.research.google.com/github/irinavalenzuela/Applied-Data-Science-Python/blob/main/Week2_Pandas_Series.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Introduction to Data Science in Python


# Week 2: Introduction to Pandas and Series Data

## The Series Data Structure

A Series can be visualized as two columns: one is the special index and the other is the actual data. The data column also has a label of its own.

In [None]:
# Lets import pandas

import pandas as pd

In [None]:
# Start from an ordinary Python list holding three student names
students = ['Alice', 'Jack', 'Molly']

# Hand the list to pd.Series; the value displayed below is a Series object
pd.Series(students)

0    Alice
1     Jack
2    Molly
dtype: object

In [None]:
# This time the list holds whole numbers
numbers = [1, 2, 3]

# Convert the list into a Series (the recorded output shows dtype int64)
pd.Series(numbers)

0    1
1    2
2    3
dtype: int64

In [None]:
# Missing data is represented differently depending on the kind of data:
#   - for objects (such as strings) a missing entry is None
#   - for numbers a missing entry is NaN ("Not a Number")
# The two are treated differently.

# Student names where the last entry is missing
students = ['Alice', 'Jack', None]

pd.Series(students)

0    Alice
1     Jack
2     None
dtype: object

In [None]:
# A numeric list whose last entry is None; pandas converts that entry to NaN
numbers = [1, 2, None]

# The resulting dtype is float64. When a numerical variable shows up as float
# rather than int, it is likely that it contains missing values.
pd.Series(numbers)

0    1.0
1    2.0
2    NaN
dtype: float64

In [None]:
# NaN is not equivalent to None for pandas.
# When you see NaN, its meaning is similar to None, but it is a
# numeric value and is treated differently for efficiency reasons.

# To test for the presence of NaN ("Not a Number"), use isnan() from the numpy library

import numpy as np

np.isnan(np.nan)


True

In [None]:
# Map each student to the class they take
students_scores = {
    'Alice': 'Physics',
    'Jack': 'Chemistry',
    'Molly': 'Mathematics',
}

# Built from a dictionary, the Series uses the keys as its index.
# The values are strings, so the Series has dtype object.
s = pd.Series(students_scores)
s

Alice        Physics
Jack       Chemistry
Molly    Mathematics
dtype: object

In [None]:
# The index attribute returns the index object of the Series

s.index

Index(['Alice', 'Jack', 'Molly'], dtype='object')

In [None]:
# A list whose elements are tuples of two names
students = [
    ('Alice', 'Brown'),
    ('Molly', 'Green'),
    ('Jack', 'White'),
]

# Each tuple is stored whole as a single element, so the dtype is object
pd.Series(students)

0    (Alice, Brown)
1    (Molly, Green)
2     (Jack, White)
dtype: object

In [None]:
# Pass the data and the index labels as two separate lists
s = pd.Series(
    ['Physics', 'Chemistry', 'English'],
    index=['Alice', 'Jack', 'Molly'],
)
s

Alice      Physics
Jack     Chemistry
Molly      English
dtype: object

In [None]:
# A dictionary with three entries
students_scores = {
    'Alice': 'Physics',
    'Jack': 'Chemistry',
    'Molly': 'English',
}

# The explicit index lists three students but leaves Jack out.
# The resulting Series has no entry for Jack, and Sam (who is not a key of
# the dictionary) appears with a missing value.
s = pd.Series(students_scores, index=['Alice', 'Molly', 'Sam'])
s

Alice    Physics
Molly    English
Sam          NaN
dtype: object

## Querying a Series

In [None]:
# A pandas Series can be queried either by position or by index label.
# If no index is given when the Series is created, the positions and the
# labels are effectively the same values.
# Use the iloc attribute to query by numeric position (counting from zero)
# and the loc attribute to query by index label.

import pandas as pd

# Which class each student is enrolled in, as a dictionary
students_classes = {
    'Alice': 'Physics',
    'Jack': 'Chemistry',
    'Molly': 'English',
    'Sam': 'History',
}

s = pd.Series(students_classes)
s

Alice      Physics
Jack     Chemistry
Molly      English
Sam        History
dtype: object

In [None]:
# See the fourth entry (positions start at zero, so it is position 3)
s.iloc[3]

'History'

In [None]:
# Which class does Molly take? Look her up by label with the loc attribute
s.loc['Molly']



'English'

In [None]:
# Remember that [] is the indexing operator, while iloc and loc are attributes.
# This Series has string labels, so an integer passed to [] cannot be a label.
# Older pandas versions silently treated such a key as a position; that
# fallback is deprecated, so ask for the position explicitly with iloc.
s.iloc[3]

'History'

In [None]:
s['Sam'] # A label passed to [] is looked up the same way as with the loc attribute

'History'

In [None]:
# Map each integer class code to the name of the class
class_code = {
    99: 'Physics',
    100: 'Chemistry',
    101: 'English',
    102: 'History',
}

# The integer keys become the index labels of the Series
s = pd.Series(class_code)
s

99       Physics
100    Chemistry
101      English
102      History
dtype: object

In [None]:
# There is no item in the class list with an index label of zero.
# With an integer index, [] looks the key up as a label (not as a position),
# so s[0] raises a KeyError. The error is caught and shown here so that the
# notebook still runs from top to bottom.
try:
    s[0]
except KeyError as missing_label:
    print('KeyError:', missing_label)

In [None]:
# Compute the average of the student grades by hand

grades = pd.Series([90, 80, 70, 60])

# Add up the grades one element at a time with a plain Python loop.
# This gives the right answer, but iterating like this is slow.
total = 0
for grade in grades:
    total = total + grade
print(total / len(grades))

75.0


In [None]:
# Vectorization: numpy can operate on the whole Series at once
# Here the numpy sum function replaces the Python loop

# Import numpy module

import numpy as np

# Call np.sum on the Series, then divide by the number of grades
total = np.sum(grades)
print(total/len(grades))

75.0


In [None]:
# Which one is faster?

# Create a big Series of 10,000 random integers between 0 and 999.
# A seeded generator is used so that the same values are produced every time
# the notebook is run from the top.

numbers = pd.Series(np.random.default_rng(42).integers(0, 1000, 10000))

# Look at the first 5 rows of the Series to see that the values are random

numbers.head(5)

0    795
1    172
2    907
3    261
4    745
dtype: int64

In [None]:
# Length of the Series, to check that it really is a big one
len(numbers)

10000

In [None]:
#@title
# Cellular magic function: timeit

# You can give timeit the number of loops that you would like to run. 
# By default, it is 1,000 loops. 

# Here to use 100 runs 
# Note that in order to use a cellular magic function, it has to be the first 
# line in the cell

%%timeit -n 100
total=0
for number in numbers:
  total+=number
print(total/len(numbers))

496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
496.8555
4

In [None]:
%%timeit -n 100
# Time the same average, this time computed with numpy's vectorized sum
total=np.sum(numbers)
total/len(numbers)

100 loops, best of 5: 58.4 µs per loop


In [None]:
# With broadcasting, you can apply an operation to every value in the series,
# changing the series.
# If we wanted to increase every random number by 2, we could do so quickly
# using the += operator directly on the Series object.

# Let's look at the head of our series
numbers.head()

0    797
1    174
2    909
3    263
4    747
dtype: int64

In [None]:
# And now let's just increase everything in the series by 2.
# Note: += changes numbers itself, so running this cell again adds 2 once more.
numbers+=2
numbers.head()

0    799
1    176
2    911
3    265
4    749
dtype: int64

In [None]:
# The .loc attribute can modify existing data in place and can also add new data:
# assigning to an index label that does not exist yet adds a new entry.

# Start from a short Series of numbers
s = pd.Series([1, 2, 3])

# 'History' is not an existing label, so this assignment adds a new entry
s.loc['History'] = 102

s

0            1
1            2
2            3
History    102
dtype: int64

In [None]:
# Index values do not have to be unique, which makes a pandas Series a
# little different conceptually than, for instance, a relational database.

# First, a Series of students and the course each of them has taken
students_classes = pd.Series(
    {'Alice': 'Physics', 'Jack': 'Chemistry', 'Molly': 'English', 'Sam': 'History'}
)
students_classes

Alice      Physics
Jack     Chemistry
Molly      English
Sam        History
dtype: object

In [None]:
# Now a Series just for a new student, Kelly, listing all of the courses
# she has taken.
# Every entry gets the same index label, Kelly; the data are the course names.

kelly_classes = pd.Series(
    ['Philosophy', 'Arts', 'Math'],
    index=['Kelly'] * 3,
)
kelly_classes

Kelly    Philosophy
Kelly          Arts
Kelly          Math
dtype: object

In [None]:
# Combine the data of the new Series with the first one.
# Series.append() was removed in pandas 2.0; pd.concat() produces the same
# combined Series and works on both older and newer pandas versions.

all_students_classes = pd.concat([students_classes, kelly_classes])

# This creates a series which has our original people in it as well as all of Kelly's courses
all_students_classes

Alice       Physics
Jack      Chemistry
Molly       English
Sam         History
Kelly    Philosophy
Kelly          Arts
Kelly          Math
dtype: object

In [None]:
# Querying a label that occurs several times in the index returns every matching entry
all_students_classes.loc['Kelly']

Kelly    Philosophy
Kelly          Arts
Kelly          Math
dtype: object

## DataFrame Data Structure


In [None]:
# Lets import Pandas
import pandas as pd

In [None]:
# One Series per student record; the dictionary keys become the index labels
record1 = pd.Series({
    'Name': 'Alice',
    'Class': 'Physics',
    'Score': 85,
})
record2 = pd.Series({
    'Name': 'Jack',
    'Class': 'Chemistry',
    'Score': 82,
})
record3 = pd.Series({
    'Name': 'Helen',
    'Class': 'Biology',
    'Score': 90,
})

In [None]:
# A DataFrame is indexed, and can be built from a group of Series
# Each Series represents a row of data; the index argument labels the rows

df = pd.DataFrame([record1, record2, record3], index=['school1','school2','school1'])

df.head()

Unnamed: 0,Name,Class,Score
school1,Alice,Physics,85
school2,Jack,Chemistry,82
school1,Helen,Biology,90


In [None]:
# A DataFrame is 2 dimensional
# Extract data using the .iloc or .loc attributes; here .loc selects the row labelled 'school2'

df.loc['school2']

Name          Jack
Class    Chemistry
Score           82
Name: school2, dtype: object

In [None]:
# Check the data type of the selected row

type(df.loc['school2'])

pandas.core.series.Series

In [None]:
# Alternative method: a list of dictionaries, where each dictionary is one row

students = [
    {'Name': 'Alice', 'Class': 'Physics', 'Score': 85},
    {'Name': 'Jack', 'Class': 'Chemistry', 'Score': 82},
    {'Name': 'Helen', 'Class': 'Biology', 'Score': 90},
]

# Build the DataFrame from the list; the index argument labels the rows

df = pd.DataFrame(students, index=['School1', 'School2', 'School1'])

df.head()


Unnamed: 0,Name,Class,Score
School1,Alice,Physics,85
School2,Jack,Chemistry,82
School1,Helen,Biology,90


In [None]:
# We can extract data using the .iloc and .loc attributes.
# Because the DataFrame is two-dimensional, passing a single value to the loc indexing
# operator returns a Series when only one row matches that label.

df.loc['School2']

Name          Jack
Class    Chemistry
Score           82
Name: School2, dtype: object

In [None]:
# Check the type of the single row selected with .loc
type(df.loc['School2'])

pandas.core.series.Series

In [None]:
# Index labels and column names, along either the horizontal or the vertical axis, can be non-unique.
# In this example there are two records for School1, stored as different rows.
# If we pass a single value to the DataFrame's loc attribute and several rows match, they are
# returned not as a Series, but as a new DataFrame.

df.loc['School1']

Unnamed: 0,Name,Class,Score
School1,Alice,Physics,85
School1,Helen,Biology,90


In [None]:
# The type is different this time, because several rows were selected

type(df.loc['School1'])

pandas.core.frame.DataFrame

In [None]:
# .loc also accepts a column name as a second argument.
# For instance, if we are only interested in School1's student names

df.loc['School1','Name']

School1    Alice
School1    Helen
Name: Name, dtype: object

In [None]:
# We could transpose the DataFrame. This pivots all of the rows into columns
# and all of the columns into rows, and is done with the T attribute

df.T

Unnamed: 0,School1,School2,School1.1
Name,Alice,Jack,Helen
Class,Physics,Chemistry,Biology
Score,85,82,90


In [None]:
# Then we can call .loc on the transpose to get the student names only
# (after transposing, 'Name' is a row label)

df.T.loc['Name']

School1    Alice
School2     Jack
School1    Helen
Name: Name, dtype: object

In [None]:
# Since iloc and loc are used for row selection, pandas reserves the indexing operator
# directly on the DataFrame for column selection.
# In a pandas DataFrame, columns always have a name,
# so this selection is always label based.

df['Name']

School1    Alice
School2     Jack
School1    Helen
Name: Name, dtype: object

In [None]:
# Note too that the result of a single column projection is a Series object

type(df['Name'])

pandas.core.series.Series

In [None]:
# Because the result of using the indexing operator is either a DataFrame or a Series,
# you can chain operations together.

# For instance, we can select all of the rows which relate to School1 using
# .loc, then project the Name column from just those rows

df.loc['School1']['Name']



School1    Alice
School1    Helen
Name: Name, dtype: object

In [None]:
# See the type of each result

print(type(df.loc['School1']))  # This is a DataFrame
print(type(df.loc['School1']['Name']))  # This is a Series

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>


In [None]:
# Here's another approach.
# As we saw, .loc does row selection, and it can take two parameters:
# the row index and the list of column names. The .loc attribute also supports slicing.

# If we want to select all rows, we can use a colon to indicate a full slice.
# Then we can add the column name as the second parameter, as a string.
# If we want to include multiple columns, we can pass them in a list,
# and pandas will bring back only the columns we have asked for.

# Here's an example, where we ask for all the names and scores for all schools

df.loc[:,['Name','Score']]


Unnamed: 0,Name,Score
School1,Alice,85
School2,Jack,82
School1,Helen,90


In [None]:
# Dropping data

# drop function
# Here it is given a single argument: the index (row) label to drop.
# The drop function doesn't change the DataFrame by default! Instead,
# it returns a copy of the DataFrame with the given rows removed.

df.drop('School1')

Unnamed: 0,Name,Class,Score
School2,Jack,Chemistry,82


In [None]:
# Let's look at the original DataFrame: it still contains the School1 rows

df

Unnamed: 0,Name,Class,Score
School1,Alice,Physics,85
School2,Jack,Chemistry,82
School1,Helen,Biology,90


In [None]:
# Parameters of the drop function

# The first is called inplace, and if it's
# set to True, the DataFrame will be updated in place, instead of a copy being returned.
# The second parameter is the axis along which to drop.
# By default, this value is 0, indicating the row axis.
# But you can change it to 1 if you want to drop a column.

# Let's make a copy using .copy(), so that df itself is left untouched

copy_df = df.copy()

# Let's drop the Name column in this copy

copy_df.drop("Name",inplace=True,axis=1) 

copy_df

Unnamed: 0,Class,Score
School1,Physics,85
School2,Chemistry,82
School1,Biology,90


In [None]:
# Another way to drop a column
# The del operator: this way of dropping data takes immediate effect
# on the DataFrame itself and does not return a new DataFrame.

del copy_df['Class']
copy_df

Unnamed: 0,Score
School1,85
School2,82
School1,90


In [None]:
# Adding a new column to the DataFrame is as easy as assigning some value to it using
# the indexing operator.

# For instance, if we wanted to add a class ranking column with a default
# value of None

df['ClassRanking']=None
df


Unnamed: 0,Name,Class,Score,ClassRanking
School1,Alice,Physics,85,
School2,Jack,Chemistry,82,
School1,Helen,Biology,90,


### DataFrame Indexing and Loading

In [None]:
# Quick shell command

# Jupyter notebooks use IPython as the kernel underneath, which provides convenient ways to
# run lower-level shell commands, i.e. programs of the underlying operating system.

# The shell command used here is "cat" (short for "concatenate"), which simply
# outputs the contents of a file.
# In IPython, a line that starts with an exclamation mark is executed as a shell
# command.

# So let's look at the content of a CSV file
!cat Admission_Predict.csv