In [1]:
import pandas as pd
import numpy as np

In [2]:
#The first core object of pandas is the series. A series is a one-dimensional array of indexed data.

#A pandas.Series having an index is the main difference between a pandas.Series and a NumPy array. Let’s see the difference:

# A numpy array
arr = np.random.randn(4) # random values from std normal distribution
print(type(arr))
print(arr, "\n")

# A pandas series made from the previous array
s = pd.Series(arr)
print(type(s))
print(s)

#Notice the index is printed as part of the pandas.Series while, although the np.array is indexable, 
#the index is not part of this data structure. Printing the pandas.Series also shows the values and their data type.

<class 'numpy.ndarray'>
[ 0.49822649  0.01485332  0.1052539  -0.20120628] 

<class 'pandas.core.series.Series'>
0    0.498226
1    0.014853
2    0.105254
3   -0.201206
dtype: float64


In [3]:
# The basic method to create a pandas.Series is to call
#s = pd.Series(data, index=index)
#The data parameter can be:

#a list or NumPy array,
#a Python dictionary, or
#a single number, boolean (True/False), or string.
#The index parameter is optional, if we wish to include it, it must be a list of list of indices of the same length as data.

In [4]:
# A series from a numpy array 
pd.Series(np.arange(3), index=[2023, 2024, 2025])

2023    0
2024    1
2025    2
dtype: int64

In [5]:
# A series from a list of strings with default index
pd.Series(['EDS 220', 'EDS 222', 'EDS 223', 'EDS 242'])

0    EDS 220
1    EDS 222
2    EDS 223
3    EDS 242
dtype: object

In [6]:
#Example: Creating a pandas.Series from a dictionary
#Recall that a dictionary is a set of key-value pairs. If we create a pandas.Series via a dictionary the keys will become 
#the index and the values the corresponding data.

# Construct dictionary
d = {'key_0':2, 'key_1':'3', 'key_2':5}

# Initialize series using a dictionary
pd.Series(d)

key_0    2
key_1    3
key_2    5
dtype: object

In [7]:
pd.Series(3.0, index = ['A', 'B', 'C'])

A    3.0
B    3.0
C    3.0
dtype: float64

In [8]:
# Define a series
s = pd.Series([98,73,65],index=['Andrea', 'Beth', 'Carolina'])

# Divide each element in series by 10
print(s /10, '\n')

# Take the exponential of each element in series
print(np.exp(s), '\n')

# Original series is unchanged
print(s)

Andrea      9.8
Beth        7.3
Carolina    6.5
dtype: float64 

Andrea      3.637971e+42
Beth        5.052394e+31
Carolina    1.694889e+28
dtype: float64 

Andrea      98
Beth        73
Carolina    65
dtype: int64


In [9]:
#We can also produce new pandas.Series with True/False values indicating whether the elements in a series satisfy a condition or not:

s > 70

Andrea       True
Beth         True
Carolina    False
dtype: bool

In [10]:
#In pandas we can represent a missing, NULL, or NA value with the float value numpy.nan, which stands for “not a number”. 
#Let’s construct a small series with some NA values represented this way:

# Series with NAs in it
s = pd.Series([1, 2, np.nan, 4, np.nan])
s
#Notice the data type of the values it he series is still float64.

0    1.0
1    2.0
2    NaN
3    4.0
4    NaN
dtype: float64

In [11]:
#The hasnans attribute for a pandas.Series returns True if there are any NA values in it and false otherwise:

# Check if series has NAs
s.hasnans

s.isna()
#The ouput is a pandas.Series of boolean values indicating if an element in the row at 
#the given index is np.nan (True = is NA) or not (False = not NA).

0    False
1    False
2     True
3    False
4     True
dtype: bool

# Check in 1

In [12]:
#The integer number -999 is often used to represent missing values. Create a pandas.Series named s with four integer values,
#two of which are -999. The index of this series 
#should be the the letters A through D.

data = [69, -999, 420, -999]
index = ["A", "B", "C", "D"]
series = pd.Series(data, index=index)

print(series)

#In the pandas.Series documentation, look for the method mask(). Use this method to update the series s so that the -999 values are
#replaced by NA values. HINT: check the first example in the method’s documentation.

series = series.mask(series == -999, pd.NA)

print(series)

A     69
B   -999
C    420
D   -999
dtype: int64
A     69.0
B      NaN
C    420.0
D      NaN
dtype: float64


In [13]:
#The pandas.DataFrame is the most used pandas object. It represents tabular data and we can think of it as a spreadhseet. 
#Each column of a pandas.DataFrame is a pandas.Series.

#There are many ways of creating a pandas.DataFrame. We present one simple one in this section.

#We already mentioned each column of a pandas.DataFrame is a pandas.Series. 
#In fact, the pandas.DataFrame is a dictionary of pandas.Series, with each column name being the key and the column values being the key’s value. 
#Thus, we can create a pandas.DataFrame in this way:

# Initialize dictionary with columns' data 
d = {'col_name_1' : pd.Series(np.arange(3)),
     'col_name_2' : pd.Series([3.1, 3.2, 3.3]),
     }

# Create data frame
df = pd.DataFrame(d)
df

# Change index
df.index = ['a','b','c']
df


Unnamed: 0,col_name_1,col_name_2
a,0,3.1
b,1,3.2
c,2,3.3


# Check in 2

In [14]:
#We can access the data frame’s column names via the columns attribute. Update the column names to C1 and C2 by updating this attribute.

# Rename multiple columns
df = df.rename(columns={'col_name_1': 'C1', 'col_name_2': 'C2'})

df


Unnamed: 0,C1,C2
a,0,3.1
b,1,3.2
c,2,3.3


# Lesson Summary

Most important concepts: The difference between a Series and a Dataframe. The concept of Indexes. Reviewing both this 
worksheet and my Python summer class notes was extremely helpful for comprehension and completion. 