# Jacob Sheehan
## wk12 11/07/24
### Python - Pandas Series and DataFrame

In [1]:
# Pandas is a open source Python library used for working with data sets.
# It is used for analyzing, cleaning, exploring, importing, plotting, and manipulating data.
# Fast and efficent DataFrame object


In [2]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [5]:
print(pd.__version__)

1.0.5


In [7]:
s = pd.Series(np.arange(10))
s

0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
dtype: int32

In [8]:
type(s)

pandas.core.series.Series

In [9]:
s.values

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [10]:
s.index


RangeIndex(start=0, stop=10, step=1)

In [11]:
# Create Labels
s = Series(np.arange(1,5), index=["w", "x", "y", "z"])
s

w    1
x    2
y    3
z    4
dtype: int32

In [12]:
s["x"]

2

In [14]:
s[1]

2

In [15]:
s["w"] + s["x"]

3

In [17]:
data = np.array(['a', 'b', 'c', 'd'])
s = Series(data, index = [100,101,102,103])
s

100    a
101    b
102    c
103    d
dtype: object

In [18]:
summer_olympic = Series([2520, 1122, 937, 847, 713], index = ['USA', 'Russia', 'Germany', 'UK', 'France'])
summer_olympic

USA        2520
Russia     1122
Germany     937
UK          847
France      713
dtype: int64

In [19]:
summer_olympic['USA']

2520

In [20]:
summer_olympic[summer_olympic > 800]

USA        2520
Russia     1122
Germany     937
UK          847
dtype: int64

In [21]:
summer_olympic > 800

USA         True
Russia      True
Germany     True
UK          True
France     False
dtype: bool

In [22]:
'USA' in summer_olympic

True

In [23]:
d = summer_olympic.to_dict()
d

{'USA': 2520, 'Russia': 1122, 'Germany': 937, 'UK': 847, 'France': 713}

In [24]:
s = Series(d)
s

USA        2520
Russia     1122
Germany     937
UK          847
France      713
dtype: int64

In [26]:
countries = ['Russia', 'France', 'Spain', 'Germany', 'USA', 'UK', 'Italy']
s = Series(d, index=countries)
s

Russia     1122.0
France      713.0
Spain         NaN
Germany     937.0
USA        2520.0
UK          847.0
Italy         NaN
dtype: float64

In [27]:
pd.isnull(s)

Russia     False
France     False
Spain       True
Germany    False
USA        False
UK         False
Italy       True
dtype: bool

In [28]:
pd.notnull(s)

Russia      True
France      True
Spain      False
Germany     True
USA         True
UK          True
Italy      False
dtype: bool

In [29]:
s

Russia     1122.0
France      713.0
Spain         NaN
Germany     937.0
USA        2520.0
UK          847.0
Italy         NaN
dtype: float64

In [30]:
summer_olympic + s

France     1426.0
Germany    1874.0
Italy         NaN
Russia     2244.0
Spain         NaN
UK         1694.0
USA        5040.0
dtype: float64

In [31]:
s

Russia     1122.0
France      713.0
Spain         NaN
Germany     937.0
USA        2520.0
UK          847.0
Italy         NaN
dtype: float64

In [32]:
s.name = "Summer Olympic"
s

Russia     1122.0
France      713.0
Spain         NaN
Germany     937.0
USA        2520.0
UK          847.0
Italy         NaN
Name: Summer Olympic, dtype: float64

In [33]:
s.index.name = 'Country'
s

Country
Russia     1122.0
France      713.0
Spain         NaN
Germany     937.0
USA        2520.0
UK          847.0
Italy         NaN
Name: Summer Olympic, dtype: float64

In [34]:
# DataFrame
# DataFrame is a 2 dimensional data structure, like a 2 dimensional array
data = {
    "calories": [420, 380, 390],
    "fruits": ['Orange', 'Banana', 'Grape']
    
}
data

{'calories': [420, 380, 390], 'fruits': ['Orange', 'Banana', 'Grape']}

In [35]:
# Load Data to a DataFrame Object
df = DataFrame(data)
df

Unnamed: 0,calories,fruits
0,420,Orange
1,380,Banana
2,390,Grape


In [36]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   calories  3 non-null      int64 
 1   fruits    3 non-null      object
dtypes: int64(1), object(1)
memory usage: 176.0+ bytes


In [37]:
df.describe

<bound method NDFrame.describe of    calories  fruits
0       420  Orange
1       380  Banana
2       390   Grape>

In [38]:
df['calories']

0    420
1    380
2    390
Name: calories, dtype: int64

In [39]:
type(df['calories'])

pandas.core.series.Series

In [40]:
df['fruits']

0    Orange
1    Banana
2     Grape
Name: fruits, dtype: object

In [41]:
df.loc[0]

calories       420
fruits      Orange
Name: 0, dtype: object

In [42]:
df.loc[[0,2]] # Return row 0 and 2

Unnamed: 0,calories,fruits
0,420,Orange
2,390,Grape


In [43]:
df[['fruits', 'calories']]

Unnamed: 0,fruits,calories
0,Orange,420
1,Banana,380
2,Grape,390


In [44]:
d = {
    'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
    'year': [2000, 2001, 2002, 2001, 2002, 2003],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]
}

In [45]:
df = DataFrame(d)
df

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [46]:
df.head(3)

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6


In [47]:
DataFrame(d).head(3)

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6


In [48]:
df.tail(3)

Unnamed: 0,state,year,pop
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [49]:
# Reorder the columns
df = DataFrame(df, columns=['year', 'state', 'pop'])
df

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


In [52]:
# Create a datframe from a dataframe
df = DataFrame(d, columns=['year', 'state', 'pop', 'debt'], index = ['one', 'two', 'three', 'four', 'five', 'six'])
df

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,
six,2003,Nevada,3.2,


In [53]:
df.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [54]:
df.index

Index(['one', 'two', 'three', 'four', 'five', 'six'], dtype='object')

In [55]:
df['state']

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
six      Nevada
Name: state, dtype: object

In [56]:
df.state

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
six      Nevada
Name: state, dtype: object

In [58]:
df.loc['three']

year     2002
state    Ohio
pop       3.6
debt      NaN
Name: three, dtype: object

In [59]:
df.loc[['three', 'five']]

Unnamed: 0,year,state,pop,debt
three,2002,Ohio,3.6,
five,2002,Nevada,2.9,


In [60]:
df

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,
six,2003,Nevada,3.2,


In [61]:
len(df)

6

In [62]:
len(df['state'])

6

In [64]:
df['debt'] = len(df['state'])
df

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,6
two,2001,Ohio,1.7,6
three,2002,Ohio,3.6,6
four,2001,Nevada,2.4,6
five,2002,Nevada,2.9,6
six,2003,Nevada,3.2,6


In [67]:
df['debt'] = np.arange(np.rint(len(df['state'])))
df

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,0.0
two,2001,Ohio,1.7,1.0
three,2002,Ohio,3.6,2.0
four,2001,Nevada,2.4,3.0
five,2002,Nevada,2.9,4.0
six,2003,Nevada,3.2,5.0


In [69]:
df['debt'] = np.random.random(6) * 10
df

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,3.742497
two,2001,Ohio,1.7,7.736927
three,2002,Ohio,3.6,6.874236
four,2001,Nevada,2.4,8.540514
five,2002,Nevada,2.9,8.570746
six,2003,Nevada,3.2,0.325097


In [70]:
df['debt']

one      3.742497
two      7.736927
three    6.874236
four     8.540514
five     8.570746
six      0.325097
Name: debt, dtype: float64

In [71]:
s = Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])
s

two    -1.2
four   -1.5
five   -1.7
dtype: float64

In [73]:
df['debt'] = s
df

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7
six,2003,Nevada,3.2,


In [74]:
df['eastern'] = df['state'] == 'Ohio'
df

Unnamed: 0,year,state,pop,debt,eastern
one,2000,Ohio,1.5,,True
two,2001,Ohio,1.7,-1.2,True
three,2002,Ohio,3.6,,True
four,2001,Nevada,2.4,-1.5,False
five,2002,Nevada,2.9,-1.7,False
six,2003,Nevada,3.2,,False


In [75]:
# delete column
del df['eastern']
df

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7
six,2003,Nevada,3.2,


In [76]:
df[df['pop'] > 2.4]

Unnamed: 0,year,state,pop,debt
three,2002,Ohio,3.6,
five,2002,Nevada,2.9,-1.7
six,2003,Nevada,3.2,
