# Index Alignment

```All Index objects, except for the special
MultiIndex, are single-dimensional data structures that combine the functionality and
implementation of Python sets and NumPy ndarrays.```

In [1]:
url = "https://raw.githubusercontent.com/PacktPublishing/Pandas-Cookbook/master/data/college.csv"
college = pd.read_csv(url)

In [2]:
columns = college.columns
columns

Index(['INSTNM', 'CITY', 'STABBR', 'HBCU', 'MENONLY', 'WOMENONLY', 'RELAFFIL',
       'SATVRMID', 'SATMTMID', 'DISTANCEONLY', 'UGDS', 'UGDS_WHITE',
       'UGDS_BLACK', 'UGDS_HISP', 'UGDS_ASIAN', 'UGDS_AIAN', 'UGDS_NHPI',
       'UGDS_2MOR', 'UGDS_NRA', 'UGDS_UNKN', 'PPTUG_EF', 'CURROPER', 'PCTPELL',
       'PCTFLOAN', 'UG25ABV', 'MD_EARN_WNE_P10', 'GRAD_DEBT_MDN_SUPP'],
      dtype='object')

In [3]:
#Use the values attribute to access the underlying NumPy array:
columns.values

array(['INSTNM', 'CITY', 'STABBR', 'HBCU', 'MENONLY', 'WOMENONLY',
       'RELAFFIL', 'SATVRMID', 'SATMTMID', 'DISTANCEONLY', 'UGDS',
       'UGDS_WHITE', 'UGDS_BLACK', 'UGDS_HISP', 'UGDS_ASIAN', 'UGDS_AIAN',
       'UGDS_NHPI', 'UGDS_2MOR', 'UGDS_NRA', 'UGDS_UNKN', 'PPTUG_EF',
       'CURROPER', 'PCTPELL', 'PCTFLOAN', 'UG25ABV', 'MD_EARN_WNE_P10',
       'GRAD_DEBT_MDN_SUPP'], dtype=object)

In [4]:
#Select items from the index by integer location with scalars, lists, or slices:
print(columns[5])
print(columns[[1, 7, 10]])
print(columns[-7:-4])

WOMENONLY
Index(['CITY', 'SATVRMID', 'UGDS'], dtype='object')
Index(['PPTUG_EF', 'CURROPER', 'PCTPELL'], dtype='object')


In [5]:
#Indexes share many of the same methods as Series and DataFrames:
columns.min(), columns.max(), columns.isnull().sum()

('CITY', 'WOMENONLY', 0)

In [6]:
#Use basic arithmetic and comparison operators directly on Index objects
columns + '_A'

Index(['INSTNM_A', 'CITY_A', 'STABBR_A', 'HBCU_A', 'MENONLY_A', 'WOMENONLY_A',
       'RELAFFIL_A', 'SATVRMID_A', 'SATMTMID_A', 'DISTANCEONLY_A', 'UGDS_A',
       'UGDS_WHITE_A', 'UGDS_BLACK_A', 'UGDS_HISP_A', 'UGDS_ASIAN_A',
       'UGDS_AIAN_A', 'UGDS_NHPI_A', 'UGDS_2MOR_A', 'UGDS_NRA_A',
       'UGDS_UNKN_A', 'PPTUG_EF_A', 'CURROPER_A', 'PCTPELL_A', 'PCTFLOAN_A',
       'UG25ABV_A', 'MD_EARN_WNE_P10_A', 'GRAD_DEBT_MDN_SUPP_A'],
      dtype='object')

#### <span style="color:orange"> ***Trying to change an Index value directly after its creation fails. Indexes are immutable objects*** </span>

In [17]:
columns[1] = 'city' 

In [5]:
# Indexes support the set operations, union, intersection, difference, and symmetric_difference:
c1 = columns[:4]
c2 = columns[2:6]

print(c1.union(c2))
print(c1.intersection(c2))
print(c1.symmetric_difference(c2)) #los que estan en c1 pero no en c2


Index(['CITY', 'HBCU', 'INSTNM', 'MENONLY', 'STABBR', 'WOMENONLY'], dtype='object')
Index(['STABBR', 'HBCU'], dtype='object')
Index(['CITY', 'INSTNM', 'MENONLY', 'WOMENONLY'], dtype='object')


## Producing Cartesian products

In [9]:
s1 = pd.Series(index=list('aaab'), data = np.arange(4))
s1

a    0
a    1
a    2
b    3
dtype: int32

In [10]:
s2 = pd.Series(index=list('cababb'), data=np.arange(6))
s2

c    0
a    1
b    2
a    3
b    4
b    5
dtype: int32

In [11]:
#produce the cartesian product (matrices)
s1 + s2 #10*2 table
#Pandas defaults to a missing value whenever an index label is unique to one object.

a    1.0
a    3.0
a    2.0
a    4.0
a    3.0
a    5.0
b    5.0
b    7.0
b    8.0
c    NaN
dtype: float64

```An exception to the preceding example takes place when the indexes contain the same exact
elements in the same order.```

```When this occurs, a Cartesian product does not take place, and
the indexes instead align by their position.```

In [12]:
s1 = pd.Series(index= list('aaabb'), data = np.arange(5))
s2 = pd.Series(index= list('aaabb'), data = np.arange(5))

s1+s2 #No cartesian product, just a normal sum

a    0
a    2
a    4
b    6
b    8
dtype: int32

In [19]:
#If the elements of the index are identical, but the order is different between the Series
#-----------Cartesian product occurs.
s1 = pd.Series(index=list('aaabb'), data=np.arange(5))
s2 = pd.Series(index=list('bbaaa'), data=np.arange(5))
s1+s2

a    2
a    3
a    4
a    3
a    4
a    5
a    4
a    5
a    6
b    3
b    4
b    4
b    5
dtype: int32

In [20]:
s1

a    0
a    1
a    2
b    3
b    4
dtype: int32

In [21]:
s2

b    0
b    1
a    2
a    3
a    4
dtype: int32

### ***Typically, array-like data structures in Python and other languages do not allow operations to take place when the operating dimensions do not contain the same number of elements. Pandas allows this to happen by aligning the indexes first before completing the operation.****

# Exploding indexes

In [22]:
#The last example was small. What happens when we have a big database?
url = "https://raw.githubusercontent.com/PacktPublishing/Pandas-Cookbook/master/data/employee.csv"
employee = pd.read_csv(url, index_col='RACE')
employee.head()

Unnamed: 0_level_0,UNIQUE_ID,POSITION_TITLE,DEPARTMENT,BASE_SALARY,EMPLOYMENT_TYPE,GENDER,EMPLOYMENT_STATUS,HIRE_DATE,JOB_DATE
RACE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Hispanic/Latino,0,ASSISTANT DIRECTOR (EX LVL),Municipal Courts Department,121862.0,Full Time,Female,Active,2006-06-12,2012-10-13
Hispanic/Latino,1,LIBRARY ASSISTANT,Library,26125.0,Full Time,Female,Active,2000-07-19,2010-09-18
White,2,POLICE OFFICER,Houston Police Department-HPD,45279.0,Full Time,Male,Active,2015-02-03,2015-02-03
White,3,ENGINEER/OPERATOR,Houston Fire Department (HFD),63166.0,Full Time,Male,Active,1982-02-08,1991-05-25
White,4,ELECTRICIAN,General Services Department,56347.0,Full Time,Male,Active,1989-06-19,1994-10-22


In [23]:
salary1 = employee['BASE_SALARY'].copy() #if we dont assign it to a variable we just get a VIEW object
salary2 = employee['BASE_SALARY'].copy() #a view is just a... preview result, but not really applied

In [24]:
#Let's change the order of the index for one of the Series by sorting it:
salary1 = salary1.sort_index()
salary1.head()

RACE
American Indian or Alaskan Native    78355.0
American Indian or Alaskan Native    26125.0
American Indian or Alaskan Native    98536.0
American Indian or Alaskan Native        NaN
American Indian or Alaskan Native    55461.0
Name: BASE_SALARY, dtype: float64

In [25]:
salary2.head()

RACE
Hispanic/Latino    121862.0
Hispanic/Latino     26125.0
White               45279.0
White               63166.0
White               56347.0
Name: BASE_SALARY, dtype: float64

In [26]:
#let's make the cartesian product
salary_add = salary1 + salary2
salary_add

RACE
American Indian or Alaskan Native    138702.0
American Indian or Alaskan Native    156710.0
American Indian or Alaskan Native    176891.0
American Indian or Alaskan Native    159594.0
American Indian or Alaskan Native    127734.0
                                       ...   
NaN                                   68024.0
NaN                                   56048.0
NaN                                   56790.0
NaN                                       NaN
NaN                                   56048.0
Name: BASE_SALARY, Length: 1175424, dtype: float64

In [27]:
#And check the length of them

len(salary1), len(salary2), len(salary_add)
# se crean mas columnas de las que deberian. todo porque ordenamos un indice y el otro no

(2000, 2000, 1175424)

In [31]:
index_vc = salary1.index.value_counts(dropna=False)
index_vc

Black or African American            700
White                                665
Hispanic/Latino                      480
Asian/Pacific Islander               107
NaN                                   35
American Indian or Alaskan Native     11
Others                                 2
Name: RACE, dtype: int64

In [41]:
a = salary1[salary1.index.isnull()] #filter by nulls
a.sum() #See the total of values to seee if it's good idea to drop them

1119915.0

# Filling values with unequal indexes

```When two Series are added together using the plus operator and one of the index labels
does not appear in the other, the resulting value is always missing.```

```Pandas offers the ***add*** method, which provides an option to fill the missing value.```

In [42]:
url1 = "https://raw.githubusercontent.com/PacktPublishing/Pandas-Cookbook/master/data/baseball14.csv"
url2 = "https://raw.githubusercontent.com/PacktPublishing/Pandas-Cookbook/master/data/baseball15.csv"
url3 = "https://raw.githubusercontent.com/PacktPublishing/Pandas-Cookbook/master/data/baseball16.csv"

baseball_14 = pd.read_csv(url1, index_col='playerID')
baseball_15 = pd.read_csv(url2, index_col='playerID')
baseball_16 = pd.read_csv(url3, index_col='playerID')

In [43]:
baseball_14.head()

Unnamed: 0_level_0,yearID,stint,teamID,lgID,G,AB,R,H,2B,3B,...,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP
playerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
altuvjo01,2014,1,HOU,AL,158,660,85,225,47,3,...,59.0,56.0,9.0,36,53.0,7.0,5.0,1.0,5.0,20.0
cartech02,2014,1,HOU,AL,145,507,68,115,21,1,...,88.0,5.0,2.0,56,182.0,6.0,5.0,0.0,4.0,12.0
castrja01,2014,1,HOU,AL,126,465,43,103,21,2,...,56.0,1.0,0.0,34,151.0,1.0,9.0,1.0,3.0,11.0
corpoca01,2014,1,HOU,AL,55,170,22,40,6,0,...,19.0,0.0,0.0,14,37.0,0.0,3.0,1.0,2.0,3.0
dominma01,2014,1,HOU,AL,157,564,51,121,17,0,...,57.0,0.0,1.0,29,125.0,2.0,5.0,2.0,7.0,23.0


In [44]:
#Use the index method difference to discover which index labels are in 
#baseball_14 and not in baseball_15, and vice versa:

print(baseball_14.index.difference(baseball_15.index))
print(baseball_14.index.difference(baseball_16.index))

Index(['corpoca01', 'dominma01', 'fowlede01', 'grossro01', 'guzmaje01',
       'hoeslj01', 'krausma01', 'preslal01', 'singljo02'],
      dtype='object', name='playerID')
Index(['cartech02', 'corpoca01', 'dominma01', 'fowlede01', 'grossro01',
       'guzmaje01', 'hoeslj01', 'krausma01', 'preslal01', 'singljo02',
       'villajo01'],
      dtype='object', name='playerID')


## Highlight NAN values

When two Series are added together using the plus operator and one of the index labels
does not appear in the other, the resulting value is always missing. Pandas offers the add
method, which provides an option to fill the missing value.

In [45]:
df_14 = baseball_14[['G','AB', 'R', 'H']]
df_15 = baseball_15[['AB', 'R', 'H', 'HR']]
df_14.add(df_15, fill_value=0).head(10).style.highlight_null('green') #Add gives the missing value option

Unnamed: 0_level_0,AB,G,H,HR,R
playerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
altuvjo01,1298.0,158.0,425.0,15.0,171.0
cartech02,898.0,145.0,193.0,24.0,118.0
castrja01,802.0,126.0,174.0,11.0,81.0
congeha01,201.0,,46.0,11.0,25.0
corpoca01,170.0,55.0,40.0,,22.0
correca01,387.0,,108.0,22.0,52.0
dominma01,564.0,157.0,121.0,,51.0
fowlede01,434.0,116.0,120.0,,61.0
gattiev01,566.0,,139.0,27.0,66.0
gomezca01,149.0,,36.0,4.0,19.0


## Appending columns from different DataFrames

In [46]:
dept_salary = employee[['DEPARTMENT', 'BASE_SALARY']]

In [47]:
dept_sal = dept_salary.sort_values(['DEPARTMENT', 'BASE_SALARY'], ascending = [True, False])
dept_sal

Unnamed: 0_level_0,DEPARTMENT,BASE_SALARY
RACE,Unnamed: 1_level_1,Unnamed: 2_level_1
White,Admn. & Regulatory Affairs,140416.0
Asian/Pacific Islander,Admn. & Regulatory Affairs,130416.0
White,Admn. & Regulatory Affairs,103776.0
Black or African American,Admn. & Regulatory Affairs,72741.0
Black or African American,Admn. & Regulatory Affairs,66825.0
...,...,...
,Solid Waste Management,30410.0
,Solid Waste Management,30410.0
Black or African American,Solid Waste Management,28829.0
Black or African American,Solid Waste Management,27622.0


In [48]:
max_dept_sal = dept_sal.drop_duplicates(subset='DEPARTMENT')
max_dept_sal.head()

Unnamed: 0_level_0,DEPARTMENT,BASE_SALARY
RACE,Unnamed: 1_level_1,Unnamed: 2_level_1
White,Admn. & Regulatory Affairs,140416.0
Hispanic/Latino,City Controller's Office,64251.0
Black or African American,City Council,100000.0
Hispanic/Latino,Convention and Entertainment,38397.0
Black or African American,Dept of Neighborhoods (DON),89221.0


In [49]:
# For appending, we need same indexes (REMEMBER: indexes are the first thing that is checked) como SQL
max_dept_sal = max_dept_sal.set_index('DEPARTMENT')
employee = employee.set_index('DEPARTMENT')

In [63]:
max_dept_sal.head()

Unnamed: 0_level_0,BASE_SALARY
DEPARTMENT,Unnamed: 1_level_1
Admn. & Regulatory Affairs,140416.0
City Controller's Office,64251.0
City Council,100000.0
Convention and Entertainment,38397.0
Dept of Neighborhoods (DON),89221.0


In [55]:
max_dept_sal.shape

(24, 1)

In [64]:
employee.head()

Unnamed: 0_level_0,UNIQUE_ID,POSITION_TITLE,BASE_SALARY,EMPLOYMENT_TYPE,GENDER,EMPLOYMENT_STATUS,HIRE_DATE,JOB_DATE,MAX_DEPT_SALARY
DEPARTMENT,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Municipal Courts Department,0,ASSISTANT DIRECTOR (EX LVL),121862.0,Full Time,Female,Active,2006-06-12,2012-10-13,121862.0
Library,1,LIBRARY ASSISTANT,26125.0,Full Time,Female,Active,2000-07-19,2010-09-18,107763.0
Houston Police Department-HPD,2,POLICE OFFICER,45279.0,Full Time,Male,Active,2015-02-03,2015-02-03,199596.0
Houston Fire Department (HFD),3,ENGINEER/OPERATOR,63166.0,Full Time,Male,Active,1982-02-08,1991-05-25,210588.0
General Services Department,4,ELECTRICIAN,56347.0,Full Time,Male,Active,1989-06-19,1994-10-22,89194.0


In [56]:
employee.shape

(2000, 8)

In [57]:
employee['MAX_DEPT_SALARY'] = max_dept_sal['BASE_SALARY']

In [62]:
employee.isnull().sum()

UNIQUE_ID              0
POSITION_TITLE         0
BASE_SALARY          114
EMPLOYMENT_TYPE        0
GENDER                 0
EMPLOYMENT_STATUS      0
HIRE_DATE              0
JOB_DATE               3
MAX_DEPT_SALARY        0
dtype: int64

## Highlighting the maximum value from each column

In [50]:
college.head(2)

Unnamed: 0_level_0,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,SATVRMID,SATMTMID,DISTANCEONLY,UGDS,...,UGDS_2MOR,UGDS_NRA,UGDS_UNKN,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Alabama A & M University,Normal,AL,1.0,0.0,0.0,0,424.0,420.0,0.0,4206.0,...,0.0,0.0059,0.0138,0.0656,1,0.7356,0.8284,0.1049,30300,33888.0
University of Alabama at Birmingham,Birmingham,AL,0.0,0.0,0.0,0,570.0,565.0,0.0,11383.0,...,0.0368,0.0179,0.01,0.2607,1,0.346,0.5214,0.2422,39700,21941.5


In [52]:
#check the dtypes
college.dtypes

CITY                   object
STABBR                 object
HBCU                  float64
MENONLY               float64
WOMENONLY             float64
RELAFFIL                int64
SATVRMID              float64
SATMTMID              float64
DISTANCEONLY          float64
UGDS                  float64
UGDS_WHITE            float64
UGDS_BLACK            float64
UGDS_HISP             float64
UGDS_ASIAN            float64
UGDS_AIAN             float64
UGDS_NHPI             float64
UGDS_2MOR             float64
UGDS_NRA              float64
UGDS_UNKN             float64
PPTUG_EF              float64
CURROPER                int64
PCTPELL               float64
PCTFLOAN              float64
UG25ABV               float64
MD_EARN_WNE_P10        object
GRAD_DEBT_MDN_SUPP     object
dtype: object

In [61]:
#We have two objects. We can't really take a max value out of these.let's take a look at how they are constituted
print(college[['MD_EARN_WNE_P10']].head()) #but both of them have numbers...
print(college[['GRAD_DEBT_MDN_SUPP']].head())

                                    MD_EARN_WNE_P10
INSTNM                                             
Alabama A & M University                      30300
University of Alabama at Birmingham           39700
Amridge University                            40100
University of Alabama in Huntsville           45500
Alabama State University                      26600
                                    GRAD_DEBT_MDN_SUPP
INSTNM                                                
Alabama A & M University                         33888
University of Alabama at Birmingham            21941.5
Amridge University                               23370
University of Alabama in Huntsville              24097
Alabama State University                       33118.5


In [60]:
#What is making them objects? we can check with sort_values
print(college[['MD_EARN_WNE_P10']].sort_values('MD_EARN_WNE_P10', ascending= False).head())
#we have a "Privacy surpressed" values. Some universities didn't want to give info. Los culeros. 

                                                      MD_EARN_WNE_P10
INSTNM                                                               
Sharon Regional Health System School of Nursing     PrivacySuppressed
P&A Scholars Beauty School                          PrivacySuppressed
Fairview Beauty Academy                             PrivacySuppressed
Rabbi Jacob Joseph School                           PrivacySuppressed
Acupuncture and Integrative Medicine College-Be...  PrivacySuppressed


In [62]:
print(college[['GRAD_DEBT_MDN_SUPP']].sort_values('GRAD_DEBT_MDN_SUPP', ascending= False).head())

                                   GRAD_DEBT_MDN_SUPP
INSTNM                                               
Westminster Theological Seminary    PrivacySuppressed
Dallas Theological Seminary         PrivacySuppressed
Baldwin Beauty School-South Austin  PrivacySuppressed
Baylor College of Medicine          PrivacySuppressed
Coastal Bend College                PrivacySuppressed


In [None]:
#But we want those numbers. For that, we can convert the columns to numeric using the pandas function:
#----------to_numeric()---------(for viceversa, numeric to object/string, use )

In [77]:
cols = ['MD_EARN_WNE_P10', 'GRAD_DEBT_MDN_SUPP']
for col in cols:
    college[col] = pd.to_numeric(college[col], errors='coerce')

```Nota = astype convierte al tipo de dato que queremos, aquellos valores que se puedan
pero si encuentra un nan o otro tipo de dato que no puede ser convertido, tira error. EN CAMBIO p.to_numeric NOS DA OPCION DE HACER ALGO CON ESOS ERRORES```

In [79]:
college.dtypes. #check to make sure they were converted

CITY                   object
STABBR                 object
HBCU                  float64
MENONLY               float64
WOMENONLY             float64
RELAFFIL                int64
SATVRMID              float64
SATMTMID              float64
DISTANCEONLY          float64
UGDS                  float64
UGDS_WHITE            float64
UGDS_BLACK            float64
UGDS_HISP             float64
UGDS_ASIAN            float64
UGDS_AIAN             float64
UGDS_NHPI             float64
UGDS_2MOR             float64
UGDS_NRA              float64
UGDS_UNKN             float64
PPTUG_EF              float64
CURROPER                int64
PCTPELL               float64
PCTFLOAN              float64
UG25ABV               float64
MD_EARN_WNE_P10       float64
GRAD_DEBT_MDN_SUPP    float64
dtype: object

In [66]:
#Use the select_dtypes method to filter for only numeric columns.
college_n = college.select_dtypes(include=[np.number])
college_n.head()

wow,HBCU,MENONLY,WOMENONLY,RELAFFIL,SATVRMID,SATMTMID,DISTANCEONLY,UGDS,UGDS_WHITE,UGDS_BLACK,...,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV
0,1.0,0.0,0.0,0,424.0,420.0,0.0,4206.0,0.0333,0.9353,...,0.0024,0.0019,0.0,0.0059,0.0138,0.0656,1,0.7356,0.8284,0.1049
1,0.0,0.0,0.0,0,570.0,565.0,0.0,11383.0,0.5922,0.26,...,0.0022,0.0007,0.0368,0.0179,0.01,0.2607,1,0.346,0.5214,0.2422
2,0.0,0.0,0.0,1,,,1.0,291.0,0.299,0.4192,...,0.0,0.0,0.0,0.0,0.2715,0.4536,1,0.6801,0.7795,0.854
3,0.0,0.0,0.0,0,595.0,590.0,0.0,5451.0,0.6988,0.1255,...,0.0143,0.0002,0.0172,0.0332,0.035,0.2146,1,0.3072,0.4596,0.264
4,1.0,0.0,0.0,0,425.0,430.0,0.0,4811.0,0.0158,0.9208,...,0.001,0.0006,0.0098,0.0243,0.0137,0.0892,1,0.7347,0.7554,0.127


```By utilizing the data dictionary, there are several columns that have only binary
(0/1) values that will not provide useful information.```

```To programmatically find these columns, we can create boolean Series and find all the columns that have
two unique values with the ***nunique*** method:```

In [67]:
criteria = college_n.nunique() == 2 #find collumns with only two unique values (wich is )
criteria #this generates a Series

wow
HBCU             True
MENONLY          True
WOMENONLY        True
RELAFFIL         True
SATVRMID        False
SATMTMID        False
DISTANCEONLY     True
UGDS            False
UGDS_WHITE      False
UGDS_BLACK      False
UGDS_HISP       False
UGDS_ASIAN      False
UGDS_AIAN       False
UGDS_NHPI       False
UGDS_2MOR       False
UGDS_NRA        False
UGDS_UNKN       False
PPTUG_EF        False
CURROPER         True
PCTPELL         False
PCTFLOAN        False
UG25ABV         False
dtype: bool

In [68]:
#Now that we have this criteria to identify the dicothomy variables. 
#We filter the true values through the college_n and pass them to a list
binary_cols = college_n.columns[criteria]
binary_cols

Index(['HBCU', 'MENONLY', 'WOMENONLY', 'RELAFFIL', 'DISTANCEONLY', 'CURROPER'], dtype='object', name='wow')

In [69]:
#For droping this values, we just pass this list as the condition for the DROP statement
college_n2 = college_n.drop(labels = binary_cols, axis= 'columns')
college_n2

wow,SATVRMID,SATMTMID,UGDS,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN,PPTUG_EF,PCTPELL,PCTFLOAN,UG25ABV
0,424.0,420.0,4206.0,0.0333,0.9353,0.0055,0.0019,0.0024,0.0019,0.0000,0.0059,0.0138,0.0656,0.7356,0.8284,0.1049
1,570.0,565.0,11383.0,0.5922,0.2600,0.0283,0.0518,0.0022,0.0007,0.0368,0.0179,0.0100,0.2607,0.3460,0.5214,0.2422
2,,,291.0,0.2990,0.4192,0.0069,0.0034,0.0000,0.0000,0.0000,0.0000,0.2715,0.4536,0.6801,0.7795,0.8540
3,595.0,590.0,5451.0,0.6988,0.1255,0.0382,0.0376,0.0143,0.0002,0.0172,0.0332,0.0350,0.2146,0.3072,0.4596,0.2640
4,425.0,430.0,4811.0,0.0158,0.9208,0.0121,0.0019,0.0010,0.0006,0.0098,0.0243,0.0137,0.0892,0.7347,0.7554,0.1270
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7530,,,,,,,,,,,,,,,,
7531,,,,,,,,,,,,,,,,
7532,,,,,,,,,,,,,,,,
7533,,,,,,,,,,,,,,,,


In [97]:
#Use the idxmax method to find the index label of the maximum value for each column:
max_cols = college_n2.idxmax()
max_cols

SATVRMID                             California Institute of Technology
SATMTMID                             California Institute of Technology
UGDS                                      University of Phoenix-Arizona
UGDS_WHITE                       Mr Leon's School of Hair Design-Moscow
UGDS_BLACK                           Velvatex College of Beauty Culture
UGDS_HISP                       Thunderbird School of Global Management
UGDS_ASIAN                          Cosmopolitan Beauty and Tech School
UGDS_AIAN                             Haskell Indian Nations University
UGDS_NHPI                                       Palau Community College
UGDS_2MOR                                                 LIU Brentwood
UGDS_NRA               California University of Management and Sciences
UGDS_UNKN             Le Cordon Bleu College of Culinary Arts-San Fr...
PPTUG_EF                        Thunderbird School of Global Management
PCTPELL                                        MTI Business Coll

In [98]:
#Call the unique method on the max_cols Series. This returns an ndarray of the unique column names:
unique_max_cols = max_cols.unique()
unique_max_cols[:5] #solo visualizar cuales pueden ser

array(['California Institute of Technology',
       'University of Phoenix-Arizona',
       "Mr Leon's School of Hair Design-Moscow",
       'Velvatex College of Beauty Culture',
       'Thunderbird School of Global Management'], dtype=object)

In [99]:
college_n2.loc[unique_max_cols].style.highlight_max()

Unnamed: 0_level_0,SATVRMID,SATMTMID,UGDS,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN,PPTUG_EF,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
California Institute of Technology,765.0,785.0,983.0,0.2787,0.0153,0.1221,0.4385,0.001,0.0,0.057,0.0875,0.0,0.0,0.1126,0.2303,0.0082,77800.0,11812.5
University of Phoenix-Arizona,,,151558.0,0.3098,0.1555,0.076,0.0082,0.0042,0.005,0.1131,0.0131,0.3152,0.0,0.6009,0.592,,,33000.0
Mr Leon's School of Hair Design-Moscow,,,16.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.625,0.625,0.2,,15710.0
Velvatex College of Beauty Culture,,,25.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.7692,0.0,0.52,,
Thunderbird School of Global Management,,,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,118900.0,
Cosmopolitan Beauty and Tech School,,,110.0,0.0091,0.0,0.0182,0.9727,0.0,0.0,0.0,0.0,0.0,0.3182,0.7761,0.1244,0.9545,,
Haskell Indian Nations University,430.0,440.0,805.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0224,0.8396,0.0,0.2089,22800.0,
Palau Community College,,,602.0,0.0,0.0017,0.0,0.0,0.0,0.9983,0.0,0.0,0.0,0.3887,0.856,0.0,0.2616,24700.0,
LIU Brentwood,,,15.0,0.0,0.1333,0.2667,0.0,0.0,0.0,0.5333,0.0,0.0667,0.4,0.5652,0.7826,0.7826,44600.0,25499.0
California University of Management and Sciences,,,98.0,0.0102,0.0204,0.0,0.0408,0.0,0.0,0.0,0.9286,0.0,0.0,0.0926,0.0556,0.6852,,
