## Deep Dive: Data Structures
- Lists, Tuples, Sets, and Dictionaries

In [1]:
# Example of a list
l = [1, 3, 4.9, "name", 3]

# Example of a tuple
t = (1, 3, 4.9, "name", 3)

# Example of a set
s = {1, 3, 4.9, "name", 3}

# Example of a dictionary
d = {23: "twentythree", "B": 43, "C":"CCD"}

In [2]:
print(f"The Type of L is {type(l)}")
print(f"The Type of L is {type(t)}")
print(f"The Type of L is {type(s)}")
print(f"The Type of L is {type(d)}")

In [3]:
# Accessing elements in each data structure:
# Returns index value
print(l[1])
# Returns index value
print(t[1])
# We can only see if value is contained in set as there is no defined order in sets
print(3 in s)
# Return the value for the key entered
print(d[23])

In [4]:
# Accessing a slice of a list - follows same rules as indexing/slicing strings
l[1:3]

In [5]:
# Accessing a slice of a tuple - works same as lists but immutable
t[1:3]

In [6]:
# Adding elements to a list using concatenate
print(l)
l = l + ["how", "are", "you"]

# Adding more elements to a list using the append() function
print(l)
l.append("NewElement")
print(l)

# Removing an element from a list
l.remove("NewElement")
print(l)

In [7]:
# Changing value of elements in a list
print(l)
l[0] = "ReplacementElement"
print(l)

[1, 3, 4.9, 'name', 3, 'how', 'are', 'you']
['ReplacementElement', 3, 4.9, 'name', 3, 'how', 'are', 'you']


In [8]:
# Adding two tuples together - we cannot change tuples but we can merge
t2 = (1, 2, 3)
t3 = (4, 5, 6)
t4 = t2 + t3
print(t4)

(1, 2, 3, 4, 5, 6)


In [9]:
# Adding to sets using the add() function
print(s)
s.add(46)
print(s)

# Adding to sets using the update() method - allows us to add multiple elements
s.update({23, "game", 1})
print(s)

{'name', 1, 3, 4.9}
{1, 3, 'name', 4.9, 46}
{1, 3, 'name', 4.9, 'game', 46, 23}


In [10]:
# Adding to dictionaries
print(d)
d["NewKey"] = "NewValue"
print(d)

{23: 'twentythree', 'B': 43, 'C': 'CCD'}
{23: 'twentythree', 'B': 43, 'C': 'CCD', 'NewKey': 'NewValue'}


In [11]:
# Removing elements from a list using del function
print(l)
del l[0]
print(l)

['ReplacementElement', 3, 4.9, 'name', 3, 'how', 'are', 'you']
[3, 4.9, 'name', 3, 'how', 'are', 'you']


In [12]:
# Removing elements from sets and dictionaries
print(s)
s.remove("game")
print(s)

print(d)
del d[23]
print(d)

{1, 3, 'name', 4.9, 'game', 46, 23}
{1, 3, 'name', 4.9, 46, 23}
{23: 'twentythree', 'B': 43, 'C': 'CCD', 'NewKey': 'NewValue'}
{'B': 43, 'C': 'CCD', 'NewKey': 'NewValue'}


In [13]:
# Using the copy() function on lists - allows us to create lists independent 
# of one another - Works for sets and dictionaries as well

print(l)
l2 = l
print(l2)
l2[2] = "UpdatedValue"
print(l)
print(l2)

l3 = l.copy()
print(l3)
l3[0] = "NewUpdatedElmt"
print(l3)
print(l)

[3, 4.9, 'name', 3, 'how', 'are', 'you']
[3, 4.9, 'name', 3, 'how', 'are', 'you']
[3, 4.9, 'UpdatedValue', 3, 'how', 'are', 'you']
[3, 4.9, 'UpdatedValue', 3, 'how', 'are', 'you']
[3, 4.9, 'UpdatedValue', 3, 'how', 'are', 'you']
['NewUpdatedElmt', 4.9, 'UpdatedValue', 3, 'how', 'are', 'you']
[3, 4.9, 'UpdatedValue', 3, 'how', 'are', 'you']


In [14]:
# If you create a list based on a slice of another list, it is automatically
# a copy - independent list so changes are not reflected in both lists
l3 = l[0:2]
print(l[0:2])
print(l3)
l3[0] = "New"
print(l3)
print(l[0:2])

[3, 4.9]
[3, 4.9]
['New', 4.9]
[3, 4.9]


## Practice problem

In [15]:
"""Lets say you are a teacher and you have different student records containing ids of students and 
the marks list in each subject where different students ahve taken different number of subjects. All
of these records are in a hard copy. You want to enter all of the data into your computer and 
calculate the average marks of each student then display that."""

# Creating the function to enter student info
def get_data_from_teacher():
    studentdict = {}
    while True:
        student_id = input("Enter Student ID: ")
        marks_list = input("Enter the marks as comma separated values: ")
        more_students = input("Would you like to add more marks? Type 'No' to quit: ")
        if student_id in studentdict:
            print(f"Student ID {student_id} is already inserted")
        else:
            studentdict[student_id] = marks_list.split(",")
        if more_students.lower() == "no":
            return studentdict

In [16]:
# Calling the function to enter data
student_data = get_data_from_teacher()

Enter Student ID: 8
Enter the marks as comma separated values: 78,88,85
Would you like to add more marks? Type 'No' to quit: 1,
Enter Student ID: 1
Enter the marks as comma separated values: 82,84,86
Would you like to add more marks? Type 'No' to quit: j
Enter Student ID: 7
Enter the marks as comma separated values: 97,84,84
Would you like to add more marks? Type 'No' to quit: no


In [17]:
# Displaying data entered
student_data

{'8': ['78', '88', '85'], '1': ['82', '84', '86'], '7': ['97', '84', '84']}

In [18]:
# Creating a function to get average marks of students based on entered data
def get_average_marks(D):
    avg_marks = {}
    for x in D:
        L = D[x]
        s = 0
        for marks in L:
            s += int(marks)
        avg_marks[x] = s/len(L)
    return avg_marks

In [19]:
# Calling the average marks info entered
avg = get_average_marks(student_data)

In [20]:
# Printing out the averages
for x in avg:
   print(f"Student {x} got avg marks as: {avg[x]}") 

Student 8 got avg marks as: 83.66666666666667
Student 1 got avg marks as: 84.0
Student 7 got avg marks as: 88.33333333333333


# Working with Numpy

In [21]:
# Must first import numpy
import numpy as np

In [22]:
# Creating the numpy arrays and changing the type of one array
a = np.array([1,2,3,5,7])
b = np.array((2,3,5), dtype="f")

In [23]:
# Printing np array as list
print(a)

[1 2 3 5 7]


In [24]:
type(a)

numpy.ndarray

In [25]:
type(b)

numpy.ndarray

In [26]:
a.dtype

dtype('int32')

In [27]:
b.dtype

dtype('float32')

In [28]:
# Understanding Numpy dimensions - below is an array with two dimensions
a = np.array([[1,2,3], [4,5,6]])
# Below is a 3 dimensional array
b = np.array([[[1,2,3], [4,5,6]], [[-1,-2,-3], [-4,-5,-6]]])

In [29]:
# Seeing the dimensions of the arrays - Arrays must have the same number of elements
print(a.ndim)
print(b.ndim)

2
3


In [30]:
# Accessing the value "3" in array "a"
print(a[0,2])

3


In [31]:
# Accessing the value "-2" in array "b"
print(b[1,0,1])

-2


In [32]:
# Looking at the shape of our np array "b"
print(b.shape)

(2, 2, 3)


In [33]:
# Looking at the total number of elements in the "b" array
print(b.size)

12


## Looking at some numpy functions - np.arange, reshape, random


In [34]:
# arange creates a 1D array up to number indicated in parentheses - very useful for creating iterators
# used in for loops

# Creating an array up to value 24
ex = np.arange(25)
print(ex)

# Creating an array up to 24 starting at 12
ex = np.arange(12,25)
print(ex)

# Creating an array up to 24 starting at 12 taking skipping every other number
ex = np.arange(12,25,2)
print(ex)

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24]
[12 13 14 15 16 17 18 19 20 21 22 23 24]
[12 14 16 18 20 22 24]


In [35]:
# Using the random library to generate a randomly arranged array of the numbers 0 to 10
a = np.random.permutation(np.arange(10))
print(a)

# Printing a random integer between the array 0-100
print(np.random.randint(0, 100))

[3 5 0 4 1 8 7 2 6 9]
61


In [36]:
# Using the reshape function to adjust elements in an array into different dimensional arrays - 
# useful for testing matrices

# Generating a 2 dimensional array from 100 values - 4 rows and 25 columns
a = np.arange(100).reshape(4,25)
print(a)

[[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
  24]
 [25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48
  49]
 [50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73
  74]
 [75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98
  99]]


In [37]:
# Confirming that the shape of this array is 2 dimensional array
a.shape

(4, 25)

## Numpy (Slicing)

In [38]:
# Follows syntax: A[start:end:step]
# Changes made to numpy arrays affect both the original array and the new array, even if it is a slice
a = np.arange(100)
b = a[3:10]
print(b)

[3 4 5 6 7 8 9]


In [39]:
# Changing value in slice of b will affect both array b and array a
b[0] = -12000
print(b)
print(a)

[-12000      4      5      6      7      8      9]
[     0      1      2 -12000      4      5      6      7      8      9
     10     11     12     13     14     15     16     17     18     19
     20     21     22     23     24     25     26     27     28     29
     30     31     32     33     34     35     36     37     38     39
     40     41     42     43     44     45     46     47     48     49
     50     51     52     53     54     55     56     57     58     59
     60     61     62     63     64     65     66     67     68     69
     70     71     72     73     74     75     76     77     78     79
     80     81     82     83     84     85     86     87     88     89
     90     91     92     93     94     95     96     97     98     99]


In [40]:
# In order to create a separate array we must use the copy() function
b = a[3:10].copy
print(b)

<built-in method copy of numpy.ndarray object at 0x00000252FECEBDF0>


In [41]:
# Stepping through "a" array by every 5th element
a[::5]

array([ 0,  5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80,
       85, 90, 95])

In [42]:
# Reverse stepping through entire array by every 5th element
a[::-5]

array([99, 94, 89, 84, 79, 74, 69, 64, 59, 54, 49, 44, 39, 34, 29, 24, 19,
       14,  9,  4])

In [43]:
# Finding the index value of -12000 in array "a"
idx = np.argwhere(a == -12000)[0][0]
print(idx)

3


In [44]:
# Creating a 2 dimensional array
a = np.round(10 * np.random.rand(5,4))
print(a)

[[5. 5. 0. 9.]
 [6. 8. 4. 3.]
 [2. 0. 3. 6.]
 [0. 3. 4. 1.]
 [1. 5. 8. 4.]]


In [45]:
# Accessing/slicing value "1" in second row
print(a[1,1])

8.0


In [46]:
# Accessing/slicing entire second row
print(a[1,:])

[6. 8. 4. 3.]


In [47]:
# Accessing/slicing the second column
print(a[:,1])

[5. 8. 0. 3. 5.]


In [48]:
# Slicing a sub matrix
print(a[1:3,2:4])

[[4. 3.]
 [3. 6.]]


### Using Index Arrays

In [49]:
# Creating a new array
a = np.arange(100)

# We can also index an array by passing in a list containing index values that we want to access/slice
# Accessing the 1st, 4th, and 2nd elements
print(a[[1, 4, 2]])

[1 4 2]


In [50]:
# Using a boolean array with a condition such as a<8 will return values where the condition is true
b = a[a <24]
print(b)

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23]


In [51]:
c = np.arange(8)
# We can slice an array by passing in a boolean array which returns values where our boolean = true
c[[True, True, False, True, False, False, True, True]]

array([0, 1, 3, 6, 7])

In [58]:
# Creating an np.array and calling it
a = np.round(10 * np.random.rand(2,3))
print(a)

[[ 6.  5.  6.]
 [10.  4.  4.]]


In [59]:
# Reshaping created array
a + np.arange(2).reshape(2,1)

array([[ 6.,  5.,  6.],
       [11.,  5.,  5.]])

In [60]:
# Creating a new array and calling it
b = np.round(10* np.random.rand(2, 2))
print(b)

[[ 5.  5.]
 [10.  8.]]


In [63]:
# Concatenating our two arrays using horizontal stack
c = np.hstack((a,b))
print(c)

[[ 6.  5.  6.  5.  5.]
 [10.  4.  4. 10.  8.]]


In [66]:
# Sorting a new array
a = np.random.permutation(np.arange(10))
print(a)
a.sort()
print(a)

[7 6 2 8 4 5 1 3 0 9]
[0 1 2 3 4 5 6 7 8 9]


# Pandas

## Series

In [67]:
# Must first import pandas
import pandas as pd

In [68]:
# Creating a pandas series - 1 dimensional
a = pd.Series([2,3,4,5], index=["a", "b", "c", "d"])

In [70]:
# Calling back the values in array
a.values

array([2, 3, 4, 5], dtype=int64)

In [72]:
# Looking at the type of the array - Pandas is built on top of numpy so it should be an np array
type(a.values)

numpy.ndarray

In [73]:
# Looking at the array index
a.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [74]:
# Accessing array values
a[2]

4

In [75]:
# Slicing a pandas series
a["a":"c"]

a    2
b    3
c    4
dtype: int64

In [77]:
# Converting a dictionary into a Pandas series
grades_dict = {"A": 4, "B": 3.5, "C":3, "D":2.5}
grades = pd.Series(grades_dict)

array([4. , 3.5, 3. , 2.5])

In [78]:
# Calling back the values in the grades Series
grades.values

array([4. , 3.5, 3. , 2.5])

In [79]:
# Calling the grades index
grades.index

Index(['A', 'B', 'C', 'D'], dtype='object')

In [80]:
# Creating a new pd series from a dictionary
marks_dict = {"A": 85, "B": 75, "C": 65, "D": 55}
marks = pd.Series(marks_dict)

In [81]:
marks

A    85
B    75
C    65
D    55
dtype: int64

In [82]:
# Accessing marks based on index - explicit
marks["A"]

85

In [83]:
# Slicing based on position - implicit value of index
marks[0:2]

A    85
B    75
dtype: int64

## Dataframes

In [93]:
# Creating a DF with our marks and grades Series - more than one dimension
new_df = pd.DataFrame({"Marks": marks, "Grades": grades})
new_df

Unnamed: 0,Marks,Grades
A,85,4.0
B,75,3.5
C,65,3.0
D,55,2.5


In [94]:
# Accessing values inside of a DF
new_df.values[2, 0]

65.0

In [95]:
# Returning the column index
new_df.columns

Index(['Marks', 'Grades'], dtype='object')

In [96]:
# Adding a new column to our DF - works like a dict
new_df["ScaledMarks"] = (new_df["Marks"]/90) * 100
new_df

Unnamed: 0,Marks,Grades,ScaledMarks
A,85,4.0,94.444444
B,75,3.5,83.333333
C,65,3.0,72.222222
D,55,2.5,61.111111


In [97]:
# Removing a column
del new_df["ScaledMarks"]
new_df

Unnamed: 0,Marks,Grades
A,85,4.0
B,75,3.5
C,65,3.0
D,55,2.5


In [99]:
# Handling missing data in DataFrames - NaNs
missing = pd.DataFrame([{"a": 1, "b": 2}, {"b": 3, "c": 4}])
missing

Unnamed: 0,a,b,c
0,1.0,2,
1,,3,4.0


In [101]:
# We can fill in the NaNs with a value
missing.fillna(0)

Unnamed: 0,a,b,c
0,1.0,2,0.0
1,0.0,3,4.0


In [103]:
# We can drop all NaNs - Will drop entire row if it contains NaN value
missing.dropna()

Unnamed: 0,a,b,c


##  Pandas (Indexing)

In [105]:
data = pd.Series(["A", "B", "C"], index=[1, 3, 5])

In [108]:
# Using the loc method to index Pandas Series - explicit index method
data.loc[1]

'A'

In [109]:
# Using the loc method to index Pandas Series - implicit index method
data.iloc[1]

'B'