# A Beginner’s Guide to Analyzing Data (Part 1)

Source code for the Medium article ["A Beginner’s Guide to Analyzing Data (Part 1)"](https://medium.com/better-programming/a-newbies-guide-to-analyzing-data-part-one-33e88a41884e), written by [Charles Shi](https://medium.com/@charlesshi12).

# Importing libraries

In [2]:
import pandas as pd

# Exploring Data Structures in Pandas

## Series (1-Dimensional)

In [4]:
# Build a 1-D labelled array: five string values with an explicit integer index 1..5.
pd.Series(["a", "b", "c", "d", "e"], index=[1, 2, 3, 4, 5])

1    a
2    b
3    c
4    d
5    e
dtype: object

## DataFrames (2-Dimensional)

In [6]:
# Build a 2-D table from a dict of columns; `index` supplies the row labels.
pd.DataFrame(
    data={"blue": [1, 2, 3, 4], "red": [5, 6, 7, 8]},
    index=list("abcd"),
)

Unnamed: 0,blue,red
a,1,5
b,2,6
c,3,7
d,4,8


# Downloading and Reading our Dataset

In [7]:
# Load the dataset into a DataFrame. "sat.csv" is a relative path, so the file
# must be in the notebook's current working directory.
data = pd.read_csv("sat.csv")

# Viewing and Observing our Data

In [8]:
# Show the loaded frame; pandas elides the middle rows of a long frame in the display.
data

Unnamed: 0,DBN,School Name,Number of Test Takers,Critical Reading Mean,Mathematics Mean,Writing Mean
0,01M292,Henry Street School for International Studies,31.0,391.0,425.0,385.0
1,01M448,University Neighborhood High School,60.0,394.0,419.0,387.0
2,01M450,East Side Community High School,69.0,418.0,431.0,402.0
3,01M458,SATELLITE ACADEMY FORSYTH ST,26.0,385.0,370.0,378.0
4,01M509,CMSP HIGH SCHOOL,,,,
...,...,...,...,...,...,...
455,75R025,South Richmond High School,10.0,407.0,421.0,400.0
456,75X012,PS12X LEWIS AND CLARK SCHOOL,,,,
457,75X754,P754 X - Jeffrey M. Rapport School for Career ...,,,,
458,76K460,John Jay High School,9.0,390.0,381.0,398.0


In [9]:
# Preview the first 10 rows.
data.head(10)

Unnamed: 0,DBN,School Name,Number of Test Takers,Critical Reading Mean,Mathematics Mean,Writing Mean
0,01M292,Henry Street School for International Studies,31.0,391.0,425.0,385.0
1,01M448,University Neighborhood High School,60.0,394.0,419.0,387.0
2,01M450,East Side Community High School,69.0,418.0,431.0,402.0
3,01M458,SATELLITE ACADEMY FORSYTH ST,26.0,385.0,370.0,378.0
4,01M509,CMSP HIGH SCHOOL,,,,
5,01M515,Lower East Side Preparatory High School,154.0,314.0,532.0,314.0
6,01M539,"New Explorations into Sci, Tech and Math HS",47.0,568.0,583.0,568.0
7,01M650,CASCADES HIGH SCHOOL,35.0,411.0,401.0,401.0
8,01M696,BARD HIGH SCHOOL EARLY COLLEGE,138.0,630.0,608.0,630.0
9,02M047,AMERICAN SIGN LANG ENG DUAL,11.0,405.0,415.0,385.0


In [13]:
# Boolean-mask the rows to schools with at least 100 test takers, then preview the first few.
data.loc[data["Number of Test Takers"].ge(100)].head()

Unnamed: 0,DBN,School Name,Number of Test Takers,Critical Reading Mean,Mathematics Mean,Writing Mean
5,01M515,Lower East Side Preparatory High School,154.0,314.0,532.0,314.0
8,01M696,BARD HIGH SCHOOL EARLY COLLEGE,138.0,630.0,608.0,630.0
22,02M400,HIGH SCHOOL ENVRNMNTL STUDIES,216.0,465.0,480.0,448.0
26,02M412,New York City Laboratory School Collab Studies,108.0,561.0,597.0,567.0
29,02M416,ELEANOR ROOSEVELT HIGH SCHOOL,122.0,555.0,596.0,567.0


In [14]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data.describe()

Unnamed: 0,Number of Test Takers,Critical Reading Mean,Mathematics Mean,Writing Mean
count,386.0,386.0,386.0,386.0
mean,103.658031,404.248705,412.935233,397.689119
std,145.264496,56.815631,64.990976,57.762584
min,7.0,291.0,281.0,285.0
25%,35.0,370.0,372.0,364.0
50%,54.0,392.5,394.5,383.0
75%,92.75,419.0,429.75,414.0
max,1047.0,674.0,735.0,678.0


In [15]:
# Median of the "Writing Mean" column; it matches the 50% row of describe().
data["Writing Mean"].median()

383.0

In [16]:
# Print each column's dtype and non-null count, plus the frame's memory usage.
# The non-null counts reveal which columns contain missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 460 entries, 0 to 459
Data columns (total 6 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   DBN                    460 non-null    object 
 1   School Name            460 non-null    object 
 2   Number of Test Takers  386 non-null    float64
 3   Critical Reading Mean  386 non-null    float64
 4   Mathematics Mean       386 non-null    float64
 5   Writing Mean           386 non-null    float64
dtypes: float64(4), object(2)
memory usage: 21.7+ KB


In [17]:
# Keep only the rows that have no missing values, rebinding `data` to the result
# (the unfiltered frame is no longer reachable through this name).
data = data.dropna()

In [18]:
# Inspect the frame after dropping incomplete rows; the index keeps its original
# labels, so gaps appear where rows were removed.
data

Unnamed: 0,DBN,School Name,Number of Test Takers,Critical Reading Mean,Mathematics Mean,Writing Mean
0,01M292,Henry Street School for International Studies,31.0,391.0,425.0,385.0
1,01M448,University Neighborhood High School,60.0,394.0,419.0,387.0
2,01M450,East Side Community High School,69.0,418.0,431.0,402.0
3,01M458,SATELLITE ACADEMY FORSYTH ST,26.0,385.0,370.0,378.0
5,01M515,Lower East Side Preparatory High School,154.0,314.0,532.0,314.0
...,...,...,...,...,...,...
448,32K554,ALL CITY LEADERSHIP SECONDARY,29.0,394.0,420.0,395.0
449,32K556,Bushwick Leaders High School for Academic Excel,30.0,357.0,345.0,351.0
455,75R025,South Richmond High School,10.0,407.0,421.0,400.0
458,76K460,John Jay High School,9.0,390.0,381.0,398.0


In [19]:
# Re-run the summary after dropna(). describe() already ignored missing values,
# so the statistics are the same as before the rows were dropped.
data.describe()

Unnamed: 0,Number of Test Takers,Critical Reading Mean,Mathematics Mean,Writing Mean
count,386.0,386.0,386.0,386.0
mean,103.658031,404.248705,412.935233,397.689119
std,145.264496,56.815631,64.990976,57.762584
min,7.0,291.0,281.0,285.0
25%,35.0,370.0,372.0,364.0
50%,54.0,392.5,394.5,383.0
75%,92.75,419.0,429.75,414.0
max,1047.0,674.0,735.0,678.0


In [20]:
# Position-based lookup: the first row of the frame, whatever its index label is.
data.iloc[0]

DBN                                                              01M292
School Name              Henry Street School for International Studies 
Number of Test Takers                                                31
Critical Reading Mean                                               391
Mathematics Mean                                                    425
Writing Mean                                                        385
Name: 0, dtype: object

In [21]:
# Label-based lookup: the row whose index label is 0.
data.loc[0]

DBN                                                              01M292
School Name              Henry Street School for International Studies 
Number of Test Takers                                                31
Critical Reading Mean                                               391
Mathematics Mean                                                    425
Writing Mean                                                        385
Name: 0, dtype: object

In [22]:
# Order the schools from highest to lowest mean writing score (returns a new frame).
data.sort_values(
    by="Writing Mean",
    ascending=False,
)

Unnamed: 0,DBN,School Name,Number of Test Takers,Critical Reading Mean,Mathematics Mean,Writing Mean
42,02M475,STUYVESANT HIGH SCHOOL,804.0,674.0,735.0,678.0
185,10X445,BRONX HIGH SCHOOL OF SCIENCE,683.0,632.0,685.0,643.0
384,25Q525,Townsend Harris High School at Queens College,273.0,637.0,644.0,642.0
8,01M696,BARD HIGH SCHOOL EARLY COLLEGE,138.0,630.0,608.0,630.0
193,10X696,HS of American Studies at Lehman College,74.0,635.0,630.0,619.0
...,...,...,...,...,...,...
378,25Q263,FLUSHING INTERNATIONAL HIGH SCHOOL,57.0,325.0,415.0,311.0
243,13K616,Brooklyn High Sch for Leadership Community Svc,9.0,323.0,281.0,310.0
170,10X268,Kingsbridge International High School,40.0,313.0,316.0,296.0
224,12X550,HIGH SCHOOL OF WORLD CULTURES,50.0,291.0,333.0,291.0


# Manipulating our DataFrame

In [24]:
# Add a per-school total as the sum of the three section means.
# `assign` builds and returns a new frame instead of writing a column into the
# frame produced by dropna(), which is what triggered pandas'
# SettingWithCopyWarning with the plain `data[...] = ...` form.
data = data.assign(
    **{
        "Total SAT Score Mean": data["Critical Reading Mean"]
        + data["Mathematics Mean"]
        + data["Writing Mean"]
    }
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["Total SAT Score Mean"] = data["Critical Reading Mean"] + data["Mathematics Mean"] + \


In [25]:
# Confirm that the "Total SAT Score Mean" column is now part of the frame.
data

Unnamed: 0,DBN,School Name,Number of Test Takers,Critical Reading Mean,Mathematics Mean,Writing Mean,Total SAT Score Mean
0,01M292,Henry Street School for International Studies,31.0,391.0,425.0,385.0,1201.0
1,01M448,University Neighborhood High School,60.0,394.0,419.0,387.0,1200.0
2,01M450,East Side Community High School,69.0,418.0,431.0,402.0,1251.0
3,01M458,SATELLITE ACADEMY FORSYTH ST,26.0,385.0,370.0,378.0,1133.0
5,01M515,Lower East Side Preparatory High School,154.0,314.0,532.0,314.0,1160.0
...,...,...,...,...,...,...,...
448,32K554,ALL CITY LEADERSHIP SECONDARY,29.0,394.0,420.0,395.0,1209.0
449,32K556,Bushwick Leaders High School for Academic Excel,30.0,357.0,345.0,351.0,1053.0
455,75R025,South Richmond High School,10.0,407.0,421.0,400.0,1228.0
458,76K460,John Jay High School,9.0,390.0,381.0,398.0,1169.0


In [26]:
# Ten schools with the highest total mean score.
(
    data
    .sort_values(by="Total SAT Score Mean", ascending=False)
    .head(10)
)

Unnamed: 0,DBN,School Name,Number of Test Takers,Critical Reading Mean,Mathematics Mean,Writing Mean,Total SAT Score Mean
42,02M475,STUYVESANT HIGH SCHOOL,804.0,674.0,735.0,678.0,2087.0
185,10X445,BRONX HIGH SCHOOL OF SCIENCE,683.0,632.0,685.0,643.0,1960.0
442,31R605,STATEN ISLAND TECHNICAL HIGH SCHOOL,287.0,638.0,673.0,617.0,1928.0
384,25Q525,Townsend Harris High School at Queens College,273.0,637.0,644.0,642.0,1923.0
193,10X696,HS of American Studies at Lehman College,74.0,635.0,630.0,619.0,1884.0
412,28Q687,QUEENS HS FOR SCIENCE YORK COL,99.0,613.0,650.0,612.0,1875.0
8,01M696,BARD HIGH SCHOOL EARLY COLLEGE,138.0,630.0,608.0,630.0,1868.0
235,13K430,BROOKLYN TECHNICAL HIGH SCHOOL,1047.0,588.0,652.0,581.0,1821.0
99,05M692,High School For Math Science Engineering City ...,106.0,592.0,627.0,575.0,1794.0
26,02M412,New York City Laboratory School Collab Studies,108.0,561.0,597.0,567.0,1725.0


In [27]:
# Same ranking, restricted to schools with more than 100 test takers.
(
    data.loc[data["Number of Test Takers"].gt(100)]
    .sort_values(by="Total SAT Score Mean", ascending=False)
    .head(10)
)

Unnamed: 0,DBN,School Name,Number of Test Takers,Critical Reading Mean,Mathematics Mean,Writing Mean,Total SAT Score Mean
42,02M475,STUYVESANT HIGH SCHOOL,804.0,674.0,735.0,678.0,2087.0
185,10X445,BRONX HIGH SCHOOL OF SCIENCE,683.0,632.0,685.0,643.0,1960.0
442,31R605,STATEN ISLAND TECHNICAL HIGH SCHOOL,287.0,638.0,673.0,617.0,1928.0
384,25Q525,Townsend Harris High School at Queens College,273.0,637.0,644.0,642.0,1923.0
8,01M696,BARD HIGH SCHOOL EARLY COLLEGE,138.0,630.0,608.0,630.0,1868.0
235,13K430,BROOKLYN TECHNICAL HIGH SCHOOL,1047.0,588.0,652.0,581.0,1821.0
99,05M692,High School For Math Science Engineering City ...,106.0,592.0,627.0,575.0,1794.0
26,02M412,New York City Laboratory School Collab Studies,108.0,561.0,597.0,567.0,1725.0
29,02M416,ELEANOR ROOSEVELT HIGH SCHOOL,122.0,555.0,596.0,567.0,1718.0
76,03M479,BEACON SCHOOL,237.0,573.0,563.0,575.0,1711.0


In [31]:
# Return a copy of the frame without the DBN column; `data` itself is not modified.
data.drop(columns=["DBN"])

Unnamed: 0,School Name,Number of Test Takers,Critical Reading Mean,Mathematics Mean,Writing Mean,Total SAT Score Mean
0,Henry Street School for International Studies,31.0,391.0,425.0,385.0,1201.0
1,University Neighborhood High School,60.0,394.0,419.0,387.0,1200.0
2,East Side Community High School,69.0,418.0,431.0,402.0,1251.0
3,SATELLITE ACADEMY FORSYTH ST,26.0,385.0,370.0,378.0,1133.0
5,Lower East Side Preparatory High School,154.0,314.0,532.0,314.0,1160.0
...,...,...,...,...,...,...
448,ALL CITY LEADERSHIP SECONDARY,29.0,394.0,420.0,395.0,1209.0
449,Bushwick Leaders High School for Academic Excel,30.0,357.0,345.0,351.0,1053.0
455,South Richmond High School,10.0,407.0,421.0,400.0,1228.0
458,John Jay High School,9.0,390.0,381.0,398.0,1169.0


In [32]:
# drop() returned a new frame and the result was not assigned, so `data` still has DBN.
data

Unnamed: 0,DBN,School Name,Number of Test Takers,Critical Reading Mean,Mathematics Mean,Writing Mean,Total SAT Score Mean
0,01M292,Henry Street School for International Studies,31.0,391.0,425.0,385.0,1201.0
1,01M448,University Neighborhood High School,60.0,394.0,419.0,387.0,1200.0
2,01M450,East Side Community High School,69.0,418.0,431.0,402.0,1251.0
3,01M458,SATELLITE ACADEMY FORSYTH ST,26.0,385.0,370.0,378.0,1133.0
5,01M515,Lower East Side Preparatory High School,154.0,314.0,532.0,314.0,1160.0
...,...,...,...,...,...,...,...
448,32K554,ALL CITY LEADERSHIP SECONDARY,29.0,394.0,420.0,395.0,1209.0
449,32K556,Bushwick Leaders High School for Academic Excel,30.0,357.0,345.0,351.0,1053.0
455,75R025,South Richmond High School,10.0,407.0,421.0,400.0,1228.0
458,76K460,John Jay High School,9.0,390.0,381.0,398.0,1169.0


In [33]:
# Remove the DBN column from `data` in place (contrast with drop(), which returns
# a new frame). Not re-runnable: once the column is gone, running this cell again
# raises KeyError.
del data["DBN"]

In [34]:
# The DBN column is now gone from `data` itself.
data

Unnamed: 0,School Name,Number of Test Takers,Critical Reading Mean,Mathematics Mean,Writing Mean,Total SAT Score Mean
0,Henry Street School for International Studies,31.0,391.0,425.0,385.0,1201.0
1,University Neighborhood High School,60.0,394.0,419.0,387.0,1200.0
2,East Side Community High School,69.0,418.0,431.0,402.0,1251.0
3,SATELLITE ACADEMY FORSYTH ST,26.0,385.0,370.0,378.0,1133.0
5,Lower East Side Preparatory High School,154.0,314.0,532.0,314.0,1160.0
...,...,...,...,...,...,...
448,ALL CITY LEADERSHIP SECONDARY,29.0,394.0,420.0,395.0,1209.0
449,Bushwick Leaders High School for Academic Excel,30.0,357.0,345.0,351.0,1053.0
455,South Richmond High School,10.0,407.0,421.0,400.0,1228.0
458,John Jay High School,9.0,390.0,381.0,398.0,1169.0


In [35]:
# Drop the row labelled by the first index entry and rebind `data` to the result.
# Not idempotent: every re-run of this cell removes another row.
data = data.drop(data.index[0])

In [36]:
# The frame now starts at index label 1; the former first row has been removed.
data

Unnamed: 0,School Name,Number of Test Takers,Critical Reading Mean,Mathematics Mean,Writing Mean,Total SAT Score Mean
1,University Neighborhood High School,60.0,394.0,419.0,387.0,1200.0
2,East Side Community High School,69.0,418.0,431.0,402.0,1251.0
3,SATELLITE ACADEMY FORSYTH ST,26.0,385.0,370.0,378.0,1133.0
5,Lower East Side Preparatory High School,154.0,314.0,532.0,314.0,1160.0
6,"New Explorations into Sci, Tech and Math HS",47.0,568.0,583.0,568.0,1719.0
...,...,...,...,...,...,...
448,ALL CITY LEADERSHIP SECONDARY,29.0,394.0,420.0,395.0,1209.0
449,Bushwick Leaders High School for Academic Excel,30.0,357.0,345.0,351.0,1053.0
455,South Richmond High School,10.0,407.0,421.0,400.0,1228.0
458,John Jay High School,9.0,390.0,381.0,398.0,1169.0


In [37]:
# Return a copy with two columns relabelled; the result is displayed, not assigned.
data.rename(
    columns={
        "School Name": "A",
        "Number of Test Takers": "B",
    }
)

Unnamed: 0,A,B,Critical Reading Mean,Mathematics Mean,Writing Mean,Total SAT Score Mean
1,University Neighborhood High School,60.0,394.0,419.0,387.0,1200.0
2,East Side Community High School,69.0,418.0,431.0,402.0,1251.0
3,SATELLITE ACADEMY FORSYTH ST,26.0,385.0,370.0,378.0,1133.0
5,Lower East Side Preparatory High School,154.0,314.0,532.0,314.0,1160.0
6,"New Explorations into Sci, Tech and Math HS",47.0,568.0,583.0,568.0,1719.0
...,...,...,...,...,...,...
448,ALL CITY LEADERSHIP SECONDARY,29.0,394.0,420.0,395.0,1209.0
449,Bushwick Leaders High School for Academic Excel,30.0,357.0,345.0,351.0,1053.0
455,South Richmond High School,10.0,407.0,421.0,400.0,1228.0
458,John Jay High School,9.0,390.0,381.0,398.0,1169.0


In [38]:
# rename() returned a new frame and the result was not assigned, so the column
# names in `data` are unchanged.
data

Unnamed: 0,School Name,Number of Test Takers,Critical Reading Mean,Mathematics Mean,Writing Mean,Total SAT Score Mean
1,University Neighborhood High School,60.0,394.0,419.0,387.0,1200.0
2,East Side Community High School,69.0,418.0,431.0,402.0,1251.0
3,SATELLITE ACADEMY FORSYTH ST,26.0,385.0,370.0,378.0,1133.0
5,Lower East Side Preparatory High School,154.0,314.0,532.0,314.0,1160.0
6,"New Explorations into Sci, Tech and Math HS",47.0,568.0,583.0,568.0,1719.0
...,...,...,...,...,...,...
448,ALL CITY LEADERSHIP SECONDARY,29.0,394.0,420.0,395.0,1209.0
449,Bushwick Leaders High School for Academic Excel,30.0,357.0,345.0,351.0,1053.0
455,South Richmond High School,10.0,407.0,421.0,400.0,1228.0
458,John Jay High School,9.0,390.0,381.0,398.0,1169.0
