In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
# Read the two input files from the working directory:
#   df    - the student extract that every table below is built from
#   dfCip - the STVMAJR lookup that is later joined to df on the major code
df = pd.read_csv(filepath_or_buffer="20180920_sctAnonCopy.csv")
dfCip = pd.read_csv(filepath_or_buffer="STVMAJR.csv")

In [3]:
# Cap how many rows pandas renders for a displayed table (raise or lower as needed)
pd.set_option("display.max_rows", 80)

## A. Basic Data

In [4]:
# Headcount: one row per student in the extract.
# OPT students are not included here; they are tallied elsewhere
# (from SEVIS at time of fall report).

df.shape[0]

568

## B. Academic Level

In [5]:
# Number of students in each 'Class' value, largest class first

(
    df['Class']
    .value_counts(sort=True, ascending=False)  # pandas defaults, spelled out
)

GR - Graduate Student    380
SO - UG Sophomore         46
SR - UG Senior            44
FR - UG Freshman          43
JR - UG Junior            42
L1 - Law Year 1            9
L3 - Law Year 3            3
L2 - Law Year 2            1
Name: Class, dtype: int64

## C. New/Returning

In [6]:
# New vs. returning students: one row per level, one column per New/Returning value.
# Rows are forced into the order Undergraduate, Graduate.

(
    df.groupby(['Level', 'New/Returning'])
      .size()                                        # students per (level, status) pair
      .unstack(level='New/Returning', fill_value=0)  # statuses become columns; absent pairs count as 0
      .reindex(index=['Undergraduate', 'Graduate'])
)

New/Returning,New,Returning
Level,Unnamed: 1_level_1,Unnamed: 2_level_1
Undergraduate,48,127
Graduate,105,288


## D. Enrollment Status

In [7]:
# Enrollment time status: one row per level, one column per 'Current Time Status' value.
# Rows are forced into the order Undergraduate, Graduate.

(
    df.groupby(['Level', 'Current Time Status'])
      .size()                                              # students per (level, status) pair
      .unstack(level='Current Time Status', fill_value=0)  # statuses become columns; absent pairs count as 0
      .reindex(index=['Undergraduate', 'Graduate'])
)

Current Time Status,Full-Time,Half-Time,Less than Half-Time,No hours,"Three-Quarter Time ""Q"""
Level,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Undergraduate,168,2,0,4,1
Graduate,338,51,1,1,2


## E. Visa Type

In [8]:
# Visa types: one row per 'Visa Type', one column per level.
# Columns are forced into the order Undergraduate, Graduate.

(
    df.groupby(['Visa Type', 'Level'])
      .size()                                # students per (visa type, level) pair
      .unstack(level='Level', fill_value=0)  # levels become columns; absent pairs count as 0
      .reindex(columns=['Undergraduate', 'Graduate'])
)

Level,Undergraduate,Graduate
Visa Type,Unnamed: 1_level_1,Unnamed: 2_level_1
E2,1,0
F1,162,329
F2,0,1
FN,0,1
FP,0,3
H1,1,31
H4,1,18
HB,0,2
J1,5,5
JN,4,0


## F. Gender

In [9]:
# Gender: one row per level, one column per 'Gender' value.
# Rows are forced into the order Undergraduate, Graduate.

(
    df.groupby(['Level', 'Gender'])
      .size()                                 # students per (level, gender) pair
      .unstack(level='Gender', fill_value=0)  # genders become columns; absent pairs count as 0
      .reindex(index=['Undergraduate', 'Graduate'])
)

Gender,Female,Male
Level,Unnamed: 1_level_1,Unnamed: 2_level_1
Undergraduate,70,105
Graduate,195,198


## G. Marital Status
  
We don't report this because marital status is not included in our report.

## H. Primary source of funds
  
We don't report this either, but I singled out our Fulbrights as "International Organization".

## I. Field of Study

In [10]:
# Reduce each raw 'majr' value to its major code: the 2nd through 5th characters of the string.
#
# The untouched value is saved to 'majr_raw' the first time this cell runs, and the code is
# always re-derived from that saved copy. Re-running the cell therefore gives the same result
# instead of cutting off another leading character each time.

if 'majr_raw' not in df.columns:
    df['majr_raw'] = df['majr']

df['majr'] = df['majr_raw'].astype(str).str[1:5]
# df['majr'] # uncomment to check

In [11]:
# Join the main table to the STVMAJR table on the major code ('majr'), which adds a column
# for each student's CIP code. The left join keeps every student row, including students
# whose major code has no match in STVMAJR.

mergedDf = df.merge(dfCip, how ='left', on = 'majr')

# Guard: if STVMAJR lists the same major code more than once, the join duplicates those
# students and every count computed from mergedDf would be inflated. Fail here instead of
# reporting wrong totals.
assert len(mergedDf) == len(df), (
    "STVMAJR has duplicate 'majr' codes: the merge produced %d rows from %d students"
    % (len(mergedDf), len(df))
)

In [12]:
# Field of study: one row per CIP code, one column per level.
# Columns are forced into the order Undergraduate, Graduate.
# NOTE(review): groupby leaves out rows whose 'cip' is missing, so students without a
# CIP match are not counted here - confirm that is intended.

(
    mergedDf.groupby(['cip', 'Level'])
            .size()                                # students per (CIP code, level) pair
            .unstack(level='Level', fill_value=0)  # levels become columns; absent pairs count as 0
            .reindex(columns=['Undergraduate', 'Graduate'])
)

Level,Undergraduate,Graduate
cip,Unnamed: 1_level_1,Unnamed: 2_level_1
30104.0,3,0
90101.0,2,0
99999.0,3,0
110103.0,0,86
110401.0,3,0
110701.0,14,0
110802.0,0,93
130401.0,0,12
130406.0,0,3
131013.0,0,1


## J. Place of Origin

In [13]:
# Place of origin: one row per 'Nation of Citizenship', one column per level.
# Columns are forced into the order Undergraduate, Graduate.

(
    mergedDf.groupby(['Nation of Citizenship', 'Level'])
            .size()                                # students per (nation, level) pair
            .unstack(level='Level', fill_value=0)  # levels become columns; absent pairs count as 0
            .reindex(columns=['Undergraduate', 'Graduate'])
)

Level,Undergraduate,Graduate
Nation of Citizenship,Unnamed: 1_level_1,Unnamed: 2_level_1
Argentina,2,0
Australia,1,0
Bangladesh,0,3
Bolivia,1,0
Brazil,4,7
Cambodia,4,0
Cameroon,0,1
Canada,5,8
Chile,0,2
China,40,81


## K. Fields of Study for Top 25 Places of Origin 

In [14]:
# Build one frame per listed country, then stack them into two combined frames
# (dfTop25A and dfTop25B) that the next two cells tabulate.

def _citizens_of(country):
    """Return the rows of mergedDf whose 'Nation of Citizenship' equals `country` exactly."""
    return mergedDf[mergedDf['Nation of Citizenship'] == country]

# NOTE(review): matching is by exact string. Iran, Kuwait and Venezuela do not appear as
# columns in the tables produced from these frames - presumably there are no such students,
# but verify the spellings against the values in 'Nation of Citizenship'.
bangladesh = _citizens_of('Bangladesh')
brazil = _citizens_of('Brazil')
canada = _citizens_of('Canada')
china = _citizens_of('China')
colombia = _citizens_of('Colombia')
france = _citizens_of('France')
germany = _citizens_of('Germany')
india = _citizens_of('India')
indonesia = _citizens_of('Indonesia')
iran = _citizens_of('Iran')
japan = _citizens_of('Japan')
korea = _citizens_of('Korea, Republic of')
kuwait = _citizens_of('Kuwait')
malaysia = _citizens_of('Malaysia')
mexico = _citizens_of('Mexico')
nepal = _citizens_of('Nepal')
nigeria = _citizens_of('Nigeria')
pakistan = _citizens_of('Pakistan')
saudi = _citizens_of('Saudi Arabia')
spain = _citizens_of('Spain')
taiwan = _citizens_of('Taiwan, Republic of China')
turkey = _citizens_of('Turkey')
uk = _citizens_of('United Kingdom')
venezuela = _citizens_of('Venezuela')
vietnam = _citizens_of('Viet Nam')

# First group: Bangladesh through Korea
dfTop25A = pd.concat([
    bangladesh, brazil, canada, china, colombia, france,
    germany, india, indonesia, iran, japan, korea,
])
# Second group: Kuwait through Viet Nam
dfTop25B = pd.concat([
    kuwait, malaysia, mexico, nepal, nigeria, pakistan, saudi,
    spain, taiwan, turkey, uk, venezuela, vietnam,
])



In [15]:
# Fields of study for the first country group: one row per CIP code, one column per nation
(
    dfTop25A.groupby(['cip', 'Nation of Citizenship'])
            .size()                                                # students per (CIP code, nation) pair
            .unstack(level='Nation of Citizenship', fill_value=0)  # nations become columns; absent pairs count as 0
)

Nation of Citizenship,Bangladesh,Brazil,Canada,China,Colombia,France,Germany,India,Indonesia,Japan,"Korea, Republic of"
cip,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
30104.0,0,0,0,0,0,0,0,0,0,0,1
99999.0,0,0,0,1,0,0,0,1,0,0,0
110103.0,0,0,1,22,0,0,0,20,0,0,0
110401.0,0,0,0,1,0,0,0,0,0,0,0
110701.0,0,0,0,4,0,0,0,2,0,1,0
110802.0,2,1,0,23,1,0,1,40,0,0,2
130401.0,0,0,0,0,0,0,0,1,0,0,0
130406.0,0,0,0,0,0,0,0,1,0,0,0
131013.0,0,0,0,1,0,0,0,0,0,0,0
131099.0,0,0,0,1,0,0,0,0,0,0,0


In [16]:
# Fields of study for the second country group: one row per CIP code, one column per nation
(
    dfTop25B.groupby(['cip', 'Nation of Citizenship'])
            .size()                                                # students per (CIP code, nation) pair
            .unstack(level='Nation of Citizenship', fill_value=0)  # nations become columns; absent pairs count as 0
)

Nation of Citizenship,Malaysia,Mexico,Nepal,Nigeria,Pakistan,Saudi Arabia,Spain,"Taiwan, Republic of China",Turkey,United Kingdom,Viet Nam
cip,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
90101.0,0,0,0,0,0,2,0,0,0,0,0
99999.0,0,0,0,0,0,1,0,0,0,0,0
110103.0,0,0,7,4,2,12,0,1,0,0,3
110401.0,0,0,0,1,0,1,0,0,0,0,0
110701.0,0,0,0,0,0,2,0,0,0,0,0
110802.0,0,0,5,2,0,0,0,1,0,0,3
130401.0,0,0,0,1,0,7,0,0,0,0,1
130406.0,0,0,0,0,0,0,0,0,0,0,1
131099.0,0,0,0,0,0,1,0,0,0,0,0
131202.0,0,0,0,1,0,0,0,1,0,0,0
