# **Data Analysis with Python - 3  (27 Apr 22)**

## **Pre-Class**

### **Pandas Introduction**

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

### **Series**

`Series()`: One-dimensional ndarray with axis labels (including time series).

In [30]:
# Unpack the word into its individual characters and wrap them in a Series;
# no index is supplied, so pandas assigns the default 0..8 integer labels.
ser = pd.Series(data=[*"microsoft"])

In [31]:
# A bare name as the last expression makes the notebook render the Series
# (index labels on the left, values on the right, dtype at the bottom).
ser

0    m
1    i
2    c
3    r
4    o
5    s
6    o
7    f
8    t
dtype: object

### **Data Frame Basics (Attributes)**

`.sort_index()`: Sort a Series by its index labels (ascending by default; pass `ascending=False` for descending order).

In [33]:
# ascending=False orders the rows from the largest index label down to the smallest.
# A new Series is returned; `ser` itself is left unchanged.
ser.sort_index(ascending=False)

8    t
7    f
6    o
5    s
4    o
3    r
2    c
1    i
0    m
dtype: object

`.sort_values` : Sort a Series in ascending or descending order by the values

In [35]:
# Ascending is the default direction; for these lowercase letters that means alphabetical order.
ser.sort_values(ascending=True)

2    c
7    f
1    i
0    m
4    o
6    o
3    r
5    s
8    t
dtype: object

`.isin` : Return a boolean Series showing whether each element in the Series matches an element in the passed sequence of values exactly

In [40]:
# Element-wise membership test: True wherever the value equals one of the listed items.
ser.isin(values=['o'])

0    False
1    False
2    False
3    False
4     True
5    False
6     True
7    False
8    False
dtype: bool

`.keys()`: Return the index labels of the Series (an alias for its index).

In [41]:
# keys() hands back the Series' index object (a RangeIndex here, since no index was supplied).
ser.keys()

RangeIndex(start=0, stop=9, step=1)

`.values` : Return Series as ndarray or ndarray-like depending on the dtype

In [43]:
# The underlying data without the index, as a NumPy array (dtype=object in the output below).
ser.values

array(['m', 'i', 'c', 'r', 'o', 's', 'o', 'f', 't'], dtype=object)

`.items()`: Lazily iterate over the Series, yielding one `(index, value)` tuple per element.

In [44]:
# items() is lazy, so materialise its (index, value) pairs into a list to display them.
[(label, value) for label, value in ser.items()]

[(0, 'm'),
 (1, 'i'),
 (2, 'c'),
 (3, 'r'),
 (4, 'o'),
 (5, 's'),
 (6, 'o'),
 (7, 'f'),
 (8, 't')]

`pd.read_csv()`: Read a comma-separated values (CSV) file into a DataFrame.

In [47]:
# Relative path: the CSV is looked up in the notebook's current working directory.
dataframe3 = pd.read_csv(filepath_or_buffer='women-stem.csv')

`.head()`: This function returns the first n rows for the object based on position, default n=5

In [46]:
# Preview the first five rows (n=5 is the default, spelled out here for clarity).
dataframe3.head(n=5)

Unnamed: 0,Rank,Major_code,Major,Major_category,Total,Men,Women,ShareWomen,Median
0,1,2419,PETROLEUM ENGINEERING,Engineering,2339,2057,282,0.120564,110000
1,2,2416,MINING AND MINERAL ENGINEERING,Engineering,756,679,77,0.101852,75000
2,3,2415,METALLURGICAL ENGINEERING,Engineering,856,725,131,0.153037,73000
3,4,2417,NAVAL ARCHITECTURE AND MARINE ENGINEERING,Engineering,1258,1123,135,0.107313,70000
4,5,2418,NUCLEAR ENGINEERING,Engineering,2573,2200,373,0.144967,65000


In [50]:
# The twelve odd numbers 1, 3, ..., 23 laid out row by row as a 3x4 table.
df2 = pd.DataFrame(
    data=np.arange(1, 24, 2).reshape((3, 4)),
    columns=['var1', 'var2', 'var3', 'var4'],
)

In [51]:
# Display the whole frame; it has only three rows, so nothing is abbreviated.
df2

Unnamed: 0,var1,var2,var3,var4
0,1,3,5,7
1,9,11,13,15
2,17,19,21,23


In [52]:
# Only the first two rows, by position.
df2.head(n=2)

Unnamed: 0,var1,var2,var3,var4
0,1,3,5,7
1,9,11,13,15


`.tail()`: This function returns last n rows from the object based on position, default n=5.

In [53]:
# Only the last row, by position.
df2.tail(n=1)

Unnamed: 0,var1,var2,var3,var4
2,17,19,21,23


`.sample()`: Return a random sample of items from an axis of the object (rows by default).

In [60]:
# Draw two rows at random (without replacement, the default).
# random_state pins the random generator so the same rows come back on every
# run, which keeps the notebook reproducible under Restart & Run All.
df2.sample(n=2, random_state=42)

Unnamed: 0,var1,var2,var3,var4
1,9,11,13,15
0,1,3,5,7


In [61]:
# First five rows of the majors table (n=5 is the default, spelled out here for clarity).
dataframe3.head(n=5)

Unnamed: 0,Rank,Major_code,Major,Major_category,Total,Men,Women,ShareWomen,Median
0,1,2419,PETROLEUM ENGINEERING,Engineering,2339,2057,282,0.120564,110000
1,2,2416,MINING AND MINERAL ENGINEERING,Engineering,756,679,77,0.101852,75000
2,3,2415,METALLURGICAL ENGINEERING,Engineering,856,725,131,0.153037,73000
3,4,2417,NAVAL ARCHITECTURE AND MARINE ENGINEERING,Engineering,1258,1123,135,0.107313,70000
4,5,2418,NUCLEAR ENGINEERING,Engineering,2573,2200,373,0.144967,65000


In [70]:
# Draw three rows at random (without replacement, the default).
# random_state pins the random generator so the same rows come back on every
# run, which keeps the notebook reproducible under Restart & Run All.
dataframe3.sample(n=3, random_state=42)

Unnamed: 0,Rank,Major_code,Major,Major_category,Total,Men,Women,ShareWomen,Median
32,33,3701,APPLIED MATHEMATICS,Computers & Mathematics,4939,2794,2145,0.434298,45000
46,47,1401,ARCHITECTURE,Engineering,46420,25463,20957,0.451465,40000
22,23,2406,CIVIL ENGINEERING,Engineering,53153,41081,12072,0.227118,50000


`.shape`: Return a tuple representing the dimensionality of the DataFrame.

In [72]:
# shape is an attribute, not a method (no parentheses); it reads as (rows, columns).
dataframe3.shape

(76, 9)

In [73]:
# (rows, columns) of the small frame built with reshape(3, 4).
df2.shape

(3, 4)

In [3]:
# Build a Series from a plain list of integers; the default 0..5 index is assigned.
serr = pd.Series(data=[11, 21, 13, 41, 15, 61])
serr  # last expression in the cell, so the notebook displays it

0    11
1    21
2    13
3    41
4    15
5    61
dtype: int64

In [4]:
# Confirm the object built above is a pandas Series.
type(serr)

pandas.core.series.Series

In [8]:
# A tuple works as input too; the values mix strings and integers.
series = pd.Series(data=("Sam", "T", 35, "sam@email.org", 180))

In [9]:
# Display the Series; it is shown with dtype object in the output below.
series

0              Sam
1                T
2               35
3    sam@email.org
4              180
dtype: object

In [7]:
# Still a pandas Series, even though it was built from a tuple of mixed values.
type(series)

pandas.core.series.Series

In [16]:
# Build a DataFrame row by row: each tuple is one record, and `columns`
# supplies the header names in the same order as the tuple fields.
dataframe1 = pd.DataFrame(
    data=[
        ("Sam", "T", 35, "sam@email.org", 180),
        ("Evan", "D", 31, "evan@email.org", 170),
    ],
    columns=["First Name", "Last Initial", "Age", "Email", "Weight"],
)

In [13]:
# Display the frame built from row tuples.
dataframe1

Unnamed: 0,First Name,Last Initial,Age,Email,Weight
0,Sam,T,35,sam@email.org,180
1,Evan,D,31,evan@email.org,170


In [17]:
# Build a DataFrame column by column: each dictionary key becomes a column
# name and its list holds that column's values, one entry per row.
dataframe2 = pd.DataFrame(
    data={
        "First Name": ["Kevin", "Marcus"],
        "Last Initial": ["T", "W"],
        "Age": [38, 42],
        "Email": ["kevin@email.org", "marcus@email.org"],
        "Weight": [175, 180],
    }
)

In [18]:
# Display the frame built from the dictionary of columns.
dataframe2

Unnamed: 0,First Name,Last Initial,Age,Email,Weight
0,Kevin,T,38,kevin@email.org,175
1,Marcus,W,42,marcus@email.org,180


In [22]:
# read_csv also accepts a URL: fetch the CSV straight from the GitHub raw link
# (requires network access when the cell runs).
df = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/women-stem.csv'
)

In [24]:
# Display the frame; the middle rows are abbreviated with a `...` row in the output.
# Prefer df.head() / df.tail() / df.sample() for a controlled preview.
df

Unnamed: 0,Rank,Major_code,Major,Major_category,Total,Men,Women,ShareWomen,Median
0,1,2419,PETROLEUM ENGINEERING,Engineering,2339,2057,282,0.120564,110000
1,2,2416,MINING AND MINERAL ENGINEERING,Engineering,756,679,77,0.101852,75000
2,3,2415,METALLURGICAL ENGINEERING,Engineering,856,725,131,0.153037,73000
3,4,2417,NAVAL ARCHITECTURE AND MARINE ENGINEERING,Engineering,1258,1123,135,0.107313,70000
4,5,2418,NUCLEAR ENGINEERING,Engineering,2573,2200,373,0.144967,65000
...,...,...,...,...,...,...,...,...,...
71,72,3604,ECOLOGY,Biology & Life Science,9154,3878,5276,0.576360,33000
72,73,6109,TREATMENT THERAPY PROFESSIONS,Health,48491,13487,35004,0.721866,33000
73,74,6100,GENERAL MEDICAL AND HEALTH SERVICES,Health,33599,7574,26025,0.774577,32400
74,75,6102,COMMUNICATION DISORDERS SCIENCES AND SERVICES,Health,38279,1225,37054,0.967998,28000


In [25]:
# (rows, columns) of the frame loaded from the URL.
df.shape

(76, 9)

In [26]:
# First five rows (n=5 is the default, spelled out here for clarity).
df.head(n=5)

Unnamed: 0,Rank,Major_code,Major,Major_category,Total,Men,Women,ShareWomen,Median
0,1,2419,PETROLEUM ENGINEERING,Engineering,2339,2057,282,0.120564,110000
1,2,2416,MINING AND MINERAL ENGINEERING,Engineering,756,679,77,0.101852,75000
2,3,2415,METALLURGICAL ENGINEERING,Engineering,856,725,131,0.153037,73000
3,4,2417,NAVAL ARCHITECTURE AND MARINE ENGINEERING,Engineering,1258,1123,135,0.107313,70000
4,5,2418,NUCLEAR ENGINEERING,Engineering,2573,2200,373,0.144967,65000


In [29]:
# Last fifteen rows, by position.
df.tail(n=15)

Unnamed: 0,Rank,Major_code,Major,Major_category,Total,Men,Women,ShareWomen,Median
61,62,2001,COMMUNICATION TECHNOLOGIES,Computers & Mathematics,18035,11431,6604,0.366177,35000
62,63,5098,MULTI-DISCIPLINARY OR GENERAL SCIENCE,Physical Sciences,62052,27015,35037,0.564639,35000
63,64,3608,PHYSIOLOGY,Biology & Life Science,22060,8422,13638,0.618223,35000
64,65,3611,NEUROSCIENCE,Biology & Life Science,13663,4944,8719,0.638147,35000
65,66,6103,HEALTH AND MEDICAL ADMINISTRATIVE SERVICES,Health,18109,4266,13843,0.764427,35000
66,67,4002,NUTRITION SCIENCES,Health,18909,2563,16346,0.864456,35000
67,68,6110,COMMUNITY AND PUBLIC HEALTH,Health,19735,4103,15632,0.792095,34000
68,69,3699,MISCELLANEOUS BIOLOGY,Biology & Life Science,10706,4747,5959,0.556604,33500
69,70,6106,HEALTH AND MEDICAL PREPARATORY PROGRAMS,Health,12740,5521,7219,0.566641,33500
70,71,3600,BIOLOGY,Biology & Life Science,280709,111762,168947,0.601858,33400


In [28]:
# Summary statistics (count, mean, std, min, quartiles, max) per column.
# Only the numeric columns appear in the output; the text columns Major and
# Major_category are left out.
df.describe()

Unnamed: 0,Rank,Major_code,Total,Men,Women,ShareWomen,Median
count,76.0,76.0,76.0,76.0,76.0,76.0,76.0
mean,38.5,3580.026316,25515.289474,12800.763158,12714.526316,0.436929,46118.421053
std,22.083176,1437.455038,43998.008553,21307.554101,29056.014723,0.232176,13187.223216
min,1.0,1301.0,609.0,488.0,77.0,0.077453,26000.0
25%,19.75,2409.75,3782.0,2047.75,1227.5,0.247918,36150.0
50%,38.5,3601.5,11047.5,4583.0,5217.5,0.405868,44350.0
75%,57.25,5002.25,27509.25,11686.5,12463.5,0.591803,52250.0
max,76.0,6199.0,280709.0,111762.0,187621.0,0.967998,110000.0


## **In-Class (27 Apr 22)**