# Extra Content

In [22]:
import pandas as pd
import numpy as np

# Python: One More Data Structure  

## Collections Data Structures (standard):


Data Structure| Desc
----|------|
Lists| Heterogeneous **ordered** sequence of elements|
Tuples|Heterogeneous **Immutable ordered** sequence of elements
Dictionaries| Unordered collection stored as **Key-Value** pair
Set| Unordered collection of **unique** elements 

## Collections Data Structures (Additional):

Data Structure| Desc
----|------|
Numpy Arrays| Homogeneous sequence of elements in N-dimensional (arrays, matrices operations)|
Pandas Series| One dimensional **labeled** indexed array 
Pandas DataFrame| Multi-Index two-dimensional array (rows and columns)



# Set - Unordered collections of unique elements

In [1]:
# Example of a list
names = ["Melvin", "Jack", "Smith", "Susan", "Samantha", "Mary", "Smith", "Melvin"]
print(names)
for person in names:
    print(person, end="\n")

['Melvin', 'Jack', 'Smith', 'Susan', 'Samantha', 'Mary', 'Smith', 'Melvin']
Melvin
Jack
Smith
Susan
Samantha
Mary
Smith
Melvin


In [3]:
# Convert a list to a set with set()
name_set = set(names)
print(name_set)

{'Jack', 'Susan', 'Mary', 'Samantha', 'Smith', 'Melvin'}


#### More `set` examples

In [4]:
class_data = ["Melvin", "Jack", "Smith", "Susan", "Samantha", "Mary", "Smith", "Melvin"]
class_coding = ["Jack", "Smith", "Ali", "Anish", "Hilary", "Cesar"]

print(f"number of students in Class Data = {len(class_data)}\nnumber of students in Class Coding = {len(class_coding)}")

number of students in Class Data = 8
number of students in Class Coding = 6


In [5]:
class_data = set(class_data)
class_coding = set(class_coding)

print(f"number of students in Class Data = {len(class_data)}\nnumber of students in Class Coding = {len(class_coding)}")


number of students in Class Data = 6
number of students in Class Coding = 6


In [6]:
combine_uniqe = set.union(class_data, class_coding)
combine_uniqe

{'Ali',
 'Anish',
 'Cesar',
 'Hilary',
 'Jack',
 'Mary',
 'Melvin',
 'Samantha',
 'Smith',
 'Susan'}

In [7]:
class_data.union(class_coding)

{'Ali',
 'Anish',
 'Cesar',
 'Hilary',
 'Jack',
 'Mary',
 'Melvin',
 'Samantha',
 'Smith',
 'Susan'}

In [8]:
class_intersection = set.intersection(class_data, class_coding)
class_intersection

{'Jack', 'Smith'}

In [9]:
class_data.intersection(class_coding)

{'Jack', 'Smith'}

In [10]:
class_data.difference(class_coding)

{'Mary', 'Melvin', 'Samantha', 'Susan'}

## Unpacking 

This is a technique of quickly assigning values, it does not matter what type of itteration you use

In [11]:
# with Sets
first_name, last_name = ["Jack", "Smith"]
print(first_name)
print(type(last_name))

Jack
<class 'str'>


In [14]:
# with Tuples
first_name, last_name = ("Jack", "Smith")
print(first_name)
print(type(last_name))

Jack
<class 'str'>


In [15]:
store, sales = ["0012", [23,45,6,19,90]]

print(f"{store} store had a total sale of ${sum(sales)}")
print(type(store))

0012 store had a total sale of $183
<class 'str'>


In [16]:
def multiply(x,y):
    """
    x: numeric value
    y: numeric value
    returns x * y
    """
    return x*y

multiply(3,4)

12

In [None]:
multiply([5,6])

In [18]:
multiply(*[5,6])

30

The * symbol unpacks the list for use in the function

In [18]:
def multiply(x,y,z):
    return x*y*z

foo = [2,3,4]
multiply(*foo)

24

## `zip()` Function

In [26]:
students = ["James", "Smith", "Mark", "Mike", "Justing"]
score = [98,100,80,79,88]
grade = ["A+", "A+", "B-", "C+", "B+"]

class_combined = list(zip(students, score, grade))

print(class_combined)
    

[('James', 98, 'A+'), ('Smith', 100, 'A+'), ('Mark', 80, 'B-'), ('Mike', 79, 'C+'), ('Justing', 88, 'B+')]


In [32]:
for i in class_combined:
    print(i[1])

98
100
80
79
88


In [23]:
df = pd.DataFrame(class_combined)
df

Unnamed: 0,0,1,2
0,James,98,A+
1,Smith,100,A+
2,Mark,80,B-
3,Mike,79,C+
4,Justing,88,B+


In [24]:
df = pd.DataFrame(class_combined, columns=["Name", "Score", "Grade"])
df

Unnamed: 0,Name,Score,Grade
0,James,98,A+
1,Smith,100,A+
2,Mark,80,B-
3,Mike,79,C+
4,Justing,88,B+


# For Loops One More time: Looping with `Enumerate()`

This will return an index inaddition to looping through a list

In [33]:
# To get index and value from a list we can use enumerate
names = ["James", "Smith", "Mark", "Mike", "Justing"]

for i, name in enumerate(names):
    print(i, name)

0 James
1 Smith
2 Mark
3 Mike
4 Justing


In [34]:
# We can change inital starting point for enumerate, default is zero
for i, name in enumerate(names, 1):
    print(i, name)

1 James
2 Smith
3 Mark
4 Mike
5 Justing


In [35]:
indx = names.index("Mark")
indx

2

# Pands Creating a DataFrame with `read_clipboard()`

In [36]:
import webbrowser
website = "https://en.wikipedia.org/wiki/List_of_all-time_NFL_win–loss_records"
webbrowser.open(website)

True

In [42]:
df_wikipedia = pd.read_clipboard()

In [43]:
type(df_wikipedia)

pandas.core.frame.DataFrame

In [44]:
df_wikipedia.head()

Unnamed: 0,Rank,Team,GP,Won,Lost,Tied,Pct.,First NFL Season,Division
0,32,Tampa Bay Buccaneers,676,260,415,1,0.385,1976,NFC South
1,31,Arizona Cardinals,1346,553,753,40,0.426,1920,NFC West
2,30,Atlanta Falcons,816,358,452,6,0.442,1966,NFC South
3,29,Jacksonville Jaguars,384,170,214,0,0.443,1995,AFC South
4,28,Houston Texans,272,121,151,0,0.445,2002,AFC South


In [45]:
df_wikipedia["Team"].head()

0    Tampa Bay Buccaneers
1       Arizona Cardinals
2         Atlanta Falcons
3    Jacksonville Jaguars
4          Houston Texans
Name: Team, dtype: object

In [46]:
df_wikipedia.Team.head()

0    Tampa Bay Buccaneers
1       Arizona Cardinals
2         Atlanta Falcons
3    Jacksonville Jaguars
4          Houston Texans
Name: Team, dtype: object

In [47]:
# DataFrame Index is a sequence (range) from 0 to 31 similar to using range(0,32,1)
df_wikipedia.index

RangeIndex(start=0, stop=32, step=1)

In [48]:
df_wikipedia.set_index('Team', inplace=True)
df_wikipedia.head()

Unnamed: 0_level_0,Rank,GP,Won,Lost,Tied,Pct.,First NFL Season,Division
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Tampa Bay Buccaneers,32,676,260,415,1,0.385,1976,NFC South
Arizona Cardinals,31,1346,553,753,40,0.426,1920,NFC West
Atlanta Falcons,30,816,358,452,6,0.442,1966,NFC South
Jacksonville Jaguars,29,384,170,214,0,0.443,1995,AFC South
Houston Texans,28,272,121,151,0,0.445,2002,AFC South


In [49]:
# List DataFrame Index
df_wikipedia.index

Index(['Tampa Bay Buccaneers', 'Arizona Cardinals', 'Atlanta Falcons',
       'Jacksonville Jaguars', 'Houston Texans', 'New York Jets',
       'New Orleans Saints', 'Cincinnati Bengals', 'Detroit Lions',
       'Buffalo Bills', 'Tennessee Titans', 'Cleveland Browns',
       'Philadelphia Eagles', 'Carolina Panthers', 'Los Angeles Rams',
       'Los Angeles Chargers[d]', 'Washington Redskins', 'Seattle Seahawks',
       'Oakland Raiders', 'Kansas City Chiefs', 'Pittsburgh Steelers',
       'Indianapolis Colts[c]', 'San Francisco 49ers', 'Denver Broncos',
       'New York Giants', 'Baltimore Ravens', 'Minnesota Vikings',
       'Miami Dolphins', 'New England Patriots[b]', 'Green Bay Packers',
       'Chicago Bears', 'Dallas Cowboys'],
      dtype='object', name='Team')

In [50]:
df_wikipedia.loc["Dallas Cowboys"]

Rank                       1
GP                       898
Won                      512
Lost                     380
Tied                       6
Pct.                   0.573
First NFL Season        1960
Division            NFC East
Name: Dallas Cowboys, dtype: object

In [51]:
df_wikipedia.loc["Dallas Cowboys", "Won"]

512

**Another example using `read_clipboard()` with excel data**

In [52]:
df_excel = pd.read_clipboard()
df_excel.head()

Unnamed: 0,Rank,Team,GP,Won,Lost,Tied,Pct.,First NFL Season,Division
0,32,Tampa Bay Buccaneers,676,260,415,1,0.385,1976,NFC South
1,31,Arizona Cardinals,1346,553,753,40,0.426,1920,NFC West
2,30,Atlanta Falcons,816,358,452,6,0.442,1966,NFC South
3,29,Jacksonville Jaguars,384,170,214,0,0.443,1995,AFC South
4,28,Houston Texans,272,121,151,0,0.445,2002,AFC South


In [53]:
df_excel.columns

Index(['Rank', 'Team', 'GP', 'Won', 'Lost', 'Tied', 'Pct.', 'First NFL Season',
       'Division'],
      dtype='object')

In [54]:
df_excel.Utilities.value_counts()

AttributeError: 'DataFrame' object has no attribute 'Utilities'

## Creating Series with an Index

In [68]:
gdp_per_capita = pd.Series([59939,8612,38214,44680,1980,39532,39827,9881,32038,44841,10846,29958], 
                           index=["United States", "China", "Japan", "Germany", "India", "United Kingdom", "France","Brazil", "Italy", "Canada", "Russia", "South Korea"])

In [56]:
gdp_per_capita

United States     59939
China              8612
Japan             38214
Germany           44680
India              1980
United Kingdom    39532
France            39827
Brazil             9881
Italy             32038
Canada            44841
Russia            10846
South Korea       29958
dtype: int64

In [57]:
gdp_per_capita.min()

1980

In [58]:
gdp_per_capita.idxmin()

'India'

In [59]:
gdp_per_capita["China"]

8612

In [60]:
gdp_per_capita[1]

8612

In [61]:
gdp_per_capita.shape

(12,)

In [62]:
gdp_per_capita.index

Index(['United States', 'China', 'Japan', 'Germany', 'India', 'United Kingdom',
       'France', 'Brazil', 'Italy', 'Canada', 'Russia', 'South Korea'],
      dtype='object')

In [63]:
gdp_per_capita.values

array([59939,  8612, 38214, 44680,  1980, 39532, 39827,  9881, 32038,
       44841, 10846, 29958], dtype=int64)

In [64]:
type(gdp_per_capita.index)

pandas.core.indexes.base.Index

In [65]:
type(gdp_per_capita.values)

numpy.ndarray