### <font color="brown">Pandas - DataFrame Continued</font>

In [2]:
import numpy as np
import pandas as pd
from pandas import Series
from pandas import DataFrame

---

#### <font color="brown">Working with NaNs</font>

#### Remove rows/columns with NaNs using dropna

In [3]:
from numpy import nan as NA
datf = DataFrame([[1, 3.8, 2.1],
                  [2, NA, NA],
                  [NA, NA, NA],
                  [NA, 4.8, 1.7]])
datf

Unnamed: 0,0,1,2
0,1.0,3.8,2.1
1,2.0,,
2,,,
3,,4.8,1.7


**Drop rows that have an NaN in any column**

In [6]:
datf.dropna()

Unnamed: 0,0,1,2
0,1.0,3.8,2.1


In [7]:
datf

Unnamed: 0,0,1,2
0,1.0,3.8,2.1
1,2.0,,
2,,,
3,,4.8,1.7


In [8]:
datf1 = datf.copy()
datf1.dropna(inplace=True)
datf1

Unnamed: 0,0,1,2
0,1.0,3.8,2.1


In [9]:
# to do the same with columns, pass axis=1
datf.dropna(axis=1)

0
1
2
3


**To drop only those rows/columns that have NaN in ALL columns**

In [12]:
# drop rows that have NaNs in all columns
datf.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,3.8,2.1
1,2.0,,
3,,4.8,1.7


In [13]:
# drop columns that have NaNs in all rows
datf.dropna(how='all',axis=1)  

Unnamed: 0,0,1,2
0,1.0,3.8,2.1
1,2.0,,
2,,,
3,,4.8,1.7


#### Filling NaNs with values

**Replace all NaNs with single value**

In [14]:
datf

Unnamed: 0,0,1,2
0,1.0,3.8,2.1
1,2.0,,
2,,,
3,,4.8,1.7


In [15]:
datf.fillna(0)

Unnamed: 0,0,1,2
0,1.0,3.8,2.1
1,2.0,0.0,0.0
2,0.0,0.0,0.0
3,0.0,4.8,1.7


**Replace all NaNs in row or column using ffill (forward fill)**

In [17]:
# column-wise
datf.fillna(method='ffill')

Unnamed: 0,0,1,2
0,1.0,3.8,2.1
1,2.0,3.8,2.1
2,2.0,3.8,2.1
3,2.0,4.8,1.7


In [18]:
datf   # original not modified

Unnamed: 0,0,1,2
0,1.0,3.8,2.1
1,2.0,,
2,,,
3,,4.8,1.7


In [19]:
# row-wise
datf.fillna(method='ffill',axis=1)

Unnamed: 0,0,1,2
0,1.0,3.8,2.1
1,2.0,2.0,2.0
2,,,
3,,4.8,1.7


**Replace all NaNs in multiple columns using dictionary**

In [21]:
datf.fillna({1: 2.5, 2: 1.5})

Unnamed: 0,0,1,2
0,1.0,3.8,2.1
1,2.0,2.5,1.5
2,,2.5,1.5
3,,4.8,1.7


In [None]:
datf

**Treat column/row separately as Series, and use fillna**

In [22]:
# column as Series and fillna
datf[2].fillna(1.5)

0    2.1
1    1.5
2    1.5
3    1.7
Name: 2, dtype: float64

In [23]:
# row as Series, and fillna inplace
datfc = datf.copy()
datfc.loc[2].fillna(-1,inplace=True)
datfc

Unnamed: 0,0,1,2
0,1.0,3.8,2.1
1,2.0,,
2,-1.0,-1.0,-1.0
3,,4.8,1.7


---
#### <font color="brown">One way to deal with missing numeric data is to replace with mean</font>

In [24]:
mpgfile = open("auto_mpg_original.csv")
mpgs = pd.read_csv(mpgfile)
mpgs

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8.0,307.0,130.0,3504.0,12.0,70.0,1.0,chevrolet chevelle malibu
1,15.0,8.0,350.0,165.0,3693.0,11.5,70.0,1.0,buick skylark 320
2,18.0,8.0,318.0,150.0,3436.0,11.0,70.0,1.0,plymouth satellite
3,16.0,8.0,304.0,150.0,3433.0,12.0,70.0,1.0,amc rebel sst
4,17.0,8.0,302.0,140.0,3449.0,10.5,70.0,1.0,ford torino
...,...,...,...,...,...,...,...,...,...
401,27.0,4.0,140.0,86.0,2790.0,15.6,82.0,1.0,ford mustang gl
402,44.0,4.0,97.0,52.0,2130.0,24.6,82.0,2.0,vw pickup
403,32.0,4.0,135.0,84.0,2295.0,11.6,82.0,1.0,dodge rampage
404,28.0,4.0,120.0,79.0,2625.0,18.6,82.0,1.0,ford ranger


In [25]:
mpgs['mpg'].mean()

23.514572864321615

In [26]:
mpgs2 = mpgs.copy()

In [27]:
mpgs2[mpgs2['mpg'].isnull()]

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
10,,4.0,133.0,115.0,3090.0,17.5,70.0,2.0,citroen ds-21 pallas
11,,8.0,350.0,165.0,4142.0,11.5,70.0,1.0,chevrolet chevelle concours (sw)
12,,8.0,351.0,153.0,4034.0,11.0,70.0,1.0,ford torino (sw)
13,,8.0,383.0,175.0,4166.0,10.5,70.0,1.0,plymouth satellite (sw)
14,,8.0,360.0,175.0,3850.0,11.0,70.0,1.0,amc rebel sst (sw)
17,,8.0,302.0,140.0,3353.0,8.0,70.0,1.0,ford mustang boss 302
39,,4.0,97.0,48.0,1978.0,20.0,71.0,2.0,volkswagen super beetle 117
367,,4.0,121.0,110.0,2800.0,15.4,81.0,2.0,saab 900s


##### **Use fillna method on relevant column (Series)**

In [32]:
mpgs2['mpg'] = mpgs2['mpg'].fillna(mpgs2['mpg'].mean())

In [35]:
mpgs2.loc[10:14]

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
10,23.514573,4.0,133.0,115.0,3090.0,17.5,70.0,2.0,citroen ds-21 pallas
11,23.514573,8.0,350.0,165.0,4142.0,11.5,70.0,1.0,chevrolet chevelle concours (sw)
12,23.514573,8.0,351.0,153.0,4034.0,11.0,70.0,1.0,ford torino (sw)
13,23.514573,8.0,383.0,175.0,4166.0,10.5,70.0,1.0,plymouth satellite (sw)
14,23.514573,8.0,360.0,175.0,3850.0,11.0,70.0,1.0,amc rebel sst (sw)


In [36]:
round(mpgs2.loc[10:14])

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
10,24.0,4.0,133.0,115.0,3090.0,18.0,70.0,2.0,citroen ds-21 pallas
11,24.0,8.0,350.0,165.0,4142.0,12.0,70.0,1.0,chevrolet chevelle concours (sw)
12,24.0,8.0,351.0,153.0,4034.0,11.0,70.0,1.0,ford torino (sw)
13,24.0,8.0,383.0,175.0,4166.0,10.0,70.0,1.0,plymouth satellite (sw)
14,24.0,8.0,360.0,175.0,3850.0,11.0,70.0,1.0,amc rebel sst (sw)


In [37]:
mpgs[mpgs['horsepower'].isnull()]

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
38,25.0,4.0,98.0,,2046.0,19.0,71.0,1.0,ford pinto
133,21.0,6.0,200.0,,2875.0,17.0,74.0,1.0,ford maverick
337,40.9,4.0,85.0,,1835.0,17.3,80.0,2.0,renault lecar deluxe
343,23.6,4.0,140.0,,2905.0,14.3,80.0,1.0,ford mustang cobra
361,34.5,4.0,100.0,,2320.0,15.8,81.0,2.0,renault 18i
382,23.0,4.0,151.0,,3035.0,20.5,82.0,1.0,amc concord dl


In [38]:
mpgs2['horsepower'] = mpgs2['horsepower'].fillna(mpgs2['horsepower'].mean())

In [39]:
mpgs2.loc[[38,133]]

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
38,25.0,4.0,98.0,105.0825,2046.0,19.0,71.0,1.0,ford pinto
133,21.0,6.0,200.0,105.0825,2875.0,17.0,74.0,1.0,ford maverick


In [40]:
mpgs2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 406 entries, 0 to 405
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           406 non-null    float64
 1   cylinders     406 non-null    float64
 2   displacement  406 non-null    float64
 3   horsepower    406 non-null    float64
 4   weight        406 non-null    float64
 5   acceleration  406 non-null    float64
 6   model year    406 non-null    float64
 7   origin        406 non-null    float64
 8   car name      406 non-null    object 
dtypes: float64(8), object(1)
memory usage: 28.7+ KB


---

#### <font color="brown">General data frame manipulation</font>

**Column returned on indexing is a VIEW, so modifications will affect underlying dataframe column**

In [41]:
rand2d = np.random.random((3,2))
rand2d

array([[0.59988126, 0.17602563],
       [0.92173942, 0.40156769],
       [0.19160958, 0.07292544]])

In [42]:
randdf = DataFrame(rand2d, index=['one', 'two', 'three'],
                   columns = ['first', 'second'])
randdf

Unnamed: 0,first,second
one,0.599881,0.176026
two,0.921739,0.401568
three,0.19161,0.072925


In [43]:
col2 = randdf['second']
col2

one      0.176026
two      0.401568
three    0.072925
Name: second, dtype: float64

In [44]:
col2 += 0.05
print(col2,'\n')
print(randdf)

one      0.226026
two      0.451568
three    0.122925
Name: second, dtype: float64 

          first    second
one    0.599881  0.226026
two    0.921739  0.451568
three  0.191610  0.122925


In [45]:
# if you don't want this, make an explicit copy of the returned column series
randdf['second'] -= 0.05
randdf

Unnamed: 0,first,second
one,0.599881,0.176026
two,0.921739,0.401568
three,0.19161,0.072925


In [46]:
col2 = randdf['second'].copy()
col2 += 0.05
print(col2,'\n')
print(randdf)

one      0.226026
two      0.451568
three    0.122925
Name: second, dtype: float64 

          first    second
one    0.599881  0.176026
two    0.921739  0.401568
three  0.191610  0.072925


#### Adding dataframes together

In [47]:
rand2d = np.random.random((3,3))
randdf2 = DataFrame(rand2d, index=['one', 'two', 'four'],
                   columns = ['first', 'second', 'third'])
randdf2

Unnamed: 0,first,second,third
one,0.80313,0.042951,0.31275
two,0.86506,0.080937,0.452442
four,0.610581,0.207287,0.314254


In [48]:
randdf

Unnamed: 0,first,second
one,0.599881,0.176026
two,0.921739,0.401568
three,0.19161,0.072925


In [49]:
randdf + randdf2   # NaN will be used if either of a pair of aligned values is missing

Unnamed: 0,first,second,third
four,,,
one,1.403011,0.218976,
three,,,
two,1.7868,0.482504,


---

#### <font color="brown">Function Application and Mapping</font>

##### <font color="brown">apply, for Series objects of DataFrame (columns or rows)</font>

In [50]:
df = DataFrame(np.random.randn(4,3),columns=list("ABC"),index=["One","Two","Three",'Four'])
df

Unnamed: 0,A,B,C
One,0.561319,-0.635798,-0.125204
Two,-1.006137,-2.450083,0.390894
Three,1.861529,-1.133596,0.245603
Four,0.222871,1.68952,-0.418951


In [51]:
dfabs = df.abs()
dfabs

Unnamed: 0,A,B,C
One,0.561319,0.635798,0.125204
Two,1.006137,2.450083,0.390894
Three,1.861529,1.133596,0.245603
Four,0.222871,1.68952,0.418951


**Function that rounds each item of a series that will be sent as parameter**

In [52]:
roundfn = lambda x: round(x,2)  # x is a Series

**Use apply method on dataframe with function as parameter, each column will be sent as argument to function**

In [53]:
dfabs.apply(roundfn)  # each column is sent in as an argument

Unnamed: 0,A,B,C
One,0.56,0.64,0.13
Two,1.01,2.45,0.39
Three,1.86,1.13,0.25
Four,0.22,1.69,0.42


In [54]:
dfabs  # original does not change

Unnamed: 0,A,B,C
One,0.561319,0.635798,0.125204
Two,1.006137,2.450083,0.390894
Three,1.861529,1.133596,0.245603
Four,0.222871,1.68952,0.418951


In [55]:
# or you can directly write the lambda as argument
dfabs.apply(lambda x: round(x,2))

Unnamed: 0,A,B,C
One,0.56,0.64,0.13
Two,1.01,2.45,0.39
Three,1.86,1.13,0.25
Four,0.22,1.69,0.42


In [56]:
# this makes it clear that apply is executed a column at a time (axis=0 is default)
dfabs.apply(lambda x: np.cumsum(x))

Unnamed: 0,A,B,C
One,0.561319,0.635798,0.125204
Two,1.567456,3.085881,0.516098
Three,3.428985,4.219477,0.7617
Four,3.651856,5.908997,1.180651


In [57]:
# you can define any function, not just lambdas
def roundsum(x):
    return round(np.cumsum(x),2)

In [58]:
dfabs.apply(roundsum)

Unnamed: 0,A,B,C
One,0.56,0.64,0.13
Two,1.57,3.09,0.52
Three,3.43,4.22,0.76
Four,3.65,5.91,1.18


In [59]:
dfabs

Unnamed: 0,A,B,C
One,0.561319,0.635798,0.125204
Two,1.006137,2.450083,0.390894
Three,1.861529,1.133596,0.245603
Four,0.222871,1.68952,0.418951


In [60]:
# a row at a time
dfabs.apply(roundsum,axis=1)

Unnamed: 0,A,B,C
One,0.56,1.2,1.32
Two,1.01,3.46,3.85
Three,1.86,3.0,3.24
Four,0.22,1.91,2.33


In [61]:
# of course, you can simply extract a column or row and send it 
dfabs['A'].apply(roundfn)

One      0.56
Two      1.01
Three    1.86
Four     0.22
Name: A, dtype: float64

In [62]:
# and if you are going to do this, you can use the Series map function
dfabs['B'].map(roundfn)

One      0.64
Two      2.45
Three    1.13
Four     1.69
Name: B, dtype: float64

In [63]:
dfabs

Unnamed: 0,A,B,C
One,0.561319,0.635798,0.125204
Two,1.006137,2.450083,0.390894
Three,1.861529,1.133596,0.245603
Four,0.222871,1.68952,0.418951


In [64]:
dfabs.loc['Two'].apply(roundfn)

A    1.01
B    2.45
C    0.39
Name: Two, dtype: float64

In [65]:
# since round is a Python function you can use it directly
dfabs.loc['Two'].round(2)

A    1.01
B    2.45
C    0.39
Name: Two, dtype: float64

In [66]:
dfabs

Unnamed: 0,A,B,C
One,0.561319,0.635798,0.125204
Two,1.006137,2.450083,0.390894
Three,1.861529,1.133596,0.245603
Four,0.222871,1.68952,0.418951


In [67]:
# for built-in Python functions, apply is generally unnecessary
dfabs.round(2)

Unnamed: 0,A,B,C
One,0.56,0.64,0.13
Two,1.01,2.45,0.39
Three,1.86,1.13,0.25
Four,0.22,1.69,0.42


In [68]:
# try mapping two rows
dfabs.loc[['Two','Three']].map(lambda x: round(x,2))

AttributeError: 'DataFrame' object has no attribute 'map'

In [69]:
# use apply
dfabs.loc[['Two','Three']].apply(lambda x: round(x,2))

Unnamed: 0,A,B,C
Two,1.01,2.45,0.39
Three,1.86,1.13,0.25


---

##### <font color="brown">applymap, for one item of a DataFrame at a time</font>

In [70]:
dfabs.applymap(lambda x: round(x,2))  

Unnamed: 0,A,B,C
One,0.56,0.64,0.13
Two,1.01,2.45,0.39
Three,1.86,1.13,0.25
Four,0.22,1.69,0.42


In [71]:
dfabs  # original not changed

Unnamed: 0,A,B,C
One,0.561319,0.635798,0.125204
Two,1.006137,2.450083,0.390894
Three,1.861529,1.133596,0.245603
Four,0.222871,1.68952,0.418951


In [72]:
dfabs.applymap(lambda x: np.round(np.cumsum(x),2))

Unnamed: 0,A,B,C
One,[0.56],[0.64],[0.13]
Two,[1.01],[2.45],[0.39]
Three,[1.86],[1.13],[0.25]
Four,[0.22],[1.69],[0.42]


---

#### <font color="brown">Iterating over rows and columns of DataFrame</font>

##### <font color="brown">Iterating over rows using iterrows</font>

In [73]:
dfabs

Unnamed: 0,A,B,C
One,0.561319,0.635798,0.125204
Two,1.006137,2.450083,0.390894
Three,1.861529,1.133596,0.245603
Four,0.222871,1.68952,0.418951


In [74]:
dfround = dfabs.round(2)
dfround

Unnamed: 0,A,B,C
One,0.56,0.64,0.13
Two,1.01,2.45,0.39
Three,1.86,1.13,0.25
Four,0.22,1.69,0.42


In [75]:
for row in dfround.iterrows():
    print(row,'\n')

('One', A    0.56
B    0.64
C    0.13
Name: One, dtype: float64) 

('Two', A    1.01
B    2.45
C    0.39
Name: Two, dtype: float64) 

('Three', A    1.86
B    1.13
C    0.25
Name: Three, dtype: float64) 

('Four', A    0.22
B    1.69
C    0.42
Name: Four, dtype: float64) 



In [76]:
for row in dfround.iterrows():
    ser = row[1]
    print(ser.index)
    print(ser.values)
    print('\n')

Index(['A', 'B', 'C'], dtype='object')
[0.56 0.64 0.13]


Index(['A', 'B', 'C'], dtype='object')
[1.01 2.45 0.39]


Index(['A', 'B', 'C'], dtype='object')
[1.86 1.13 0.25]


Index(['A', 'B', 'C'], dtype='object')
[0.22 1.69 0.42]




In [77]:
# retrieving column values individually
for row in dfround.iterrows():
    ser = row[1]
    for i,col in enumerate(ser.index):
        print(f'{col}: {ser.values[i]}')
    print('\n')

A: 0.56
B: 0.64
C: 0.13


A: 1.01
B: 2.45
C: 0.39


A: 1.86
B: 1.13
C: 0.25


A: 0.22
B: 1.69
C: 0.42




##### <font color="brown">Iterating over columns using iteritems</font>

In [78]:
dfround

Unnamed: 0,A,B,C
One,0.56,0.64,0.13
Two,1.01,2.45,0.39
Three,1.86,1.13,0.25
Four,0.22,1.69,0.42


In [79]:
for col in dfround.iteritems():
    print(col)
    print('\n')

('A', One      0.56
Two      1.01
Three    1.86
Four     0.22
Name: A, dtype: float64)


('B', One      0.64
Two      2.45
Three    1.13
Four     1.69
Name: B, dtype: float64)


('C', One      0.13
Two      0.39
Three    0.25
Four     0.42
Name: C, dtype: float64)




---

---

#### <font color="brown">Grouping</font>

##### <font color="brown">Example 1a: State populations by year</font>

In [80]:
popdat = {'state': ['Arizona','Arizona','Arizona','Virginia','Virginia'],
          'year': [2005, 2010, 2015, 2010, 2015],
          'pop': [5.9, 6.6, 6.8, 7.9, 8.3]}
popdf = DataFrame(popdat)
popdf

Unnamed: 0,state,year,pop
0,Arizona,2005,5.9
1,Arizona,2010,6.6
2,Arizona,2015,6.8
3,Virginia,2010,7.9
4,Virginia,2015,8.3


In [81]:
# first group by year
yrgrp = popdf.groupby('year')
yrgrp

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x00000276FFAEF610>

In [82]:
# then sum up within each group
df = yrgrp.sum()  # pop is the only numeric column, so sum applies to it
df

Unnamed: 0_level_0,pop
year,Unnamed: 1_level_1
2005,5.9
2010,14.5
2015,15.1


In [83]:
df.index.name

'year'

In [84]:
df.reset_index()  # so we can have year as a column

Unnamed: 0,year,pop
0,2005,5.9
1,2010,14.5
2,2015,15.1


In [85]:
# in one shot
popdf.groupby('year').sum().reset_index()

Unnamed: 0,year,pop
0,2005,5.9
1,2010,14.5
2,2015,15.1


##### <font color="brown">Example 1b: State populations and debt by year</font>

In [86]:
popdf['debt'] = Series([1.2,1.2,1.1,0.9,1.2])
popdf

Unnamed: 0,state,year,pop,debt
0,Arizona,2005,5.9,1.2
1,Arizona,2010,6.6,1.2
2,Arizona,2015,6.8,1.1
3,Virginia,2010,7.9,0.9
4,Virginia,2015,8.3,1.2


In [87]:
popdf.groupby('year').sum().reset_index()

Unnamed: 0,year,pop,debt
0,2005,5.9,1.2
1,2010,14.5,2.1
2,2015,15.1,2.3


##### <font color="brown">Example 2a: School graduates by year</font>

In [88]:
grads = pd.read_csv(open('graduates.csv'))
grads

Unnamed: 0,Student School,Graduating Year,Major
0,Rutgers,2012,CS
1,Penn State,2011,EE
2,Princeton,2013,Psychology
3,MIT,2010,Physics
4,Rutgers,2018,Math
5,Penn State,2019,Economics
6,MIT,2017,CS
7,Penn State,2015,Biology
8,Rutgers,2013,Philosophy
9,Princeton,2012,Economics


**Q. How many grads in 2012?**

In [89]:
gdf = grads.groupby('Graduating Year').count()
gdf

Unnamed: 0_level_0,Student School,Major
Graduating Year,Unnamed: 1_level_1,Unnamed: 2_level_1
2010,1,1
2011,1,1
2012,3,3
2013,3,3
2015,3,3
2017,1,1
2018,2,2
2019,1,1


**The count function applied to each group**

In [90]:
gdf.loc[2012]['Major']

3

##### <font color="brown">Example 2b: School graduates by major, and within each major, by year</font>

**Q. How many Econ grads in 2105?**

**2-level grouping, first by major, then by graduating year within major**

In [100]:
gdf2 = grads.groupby(['Major', 'Graduating Year']).count()
gdf2

Unnamed: 0_level_0,Unnamed: 1_level_0,Student School
Major,Graduating Year,Unnamed: 2_level_1
Biology,2015,1
CS,2012,2
CS,2017,1
CS,2018,1
EE,2011,1
Economics,2012,1
Economics,2015,2
Economics,2019,1
Math,2018,1
Philosophy,2013,1


In [95]:
gdf2.loc['Economics',2015]

Student School    2
Name: (Economics, 2015), dtype: int64

In [102]:
gdf2.loc['Economics']

Unnamed: 0_level_0,Student School
Graduating Year,Unnamed: 1_level_1
2012,1
2015,2
2019,1


In [97]:
type(gdf2.loc['Economics',2015])

pandas.core.series.Series

In [98]:
gdf2.loc['Economics',2015]['Student School']

2

In [104]:
gdf2_reset = gdf2.reset_index()
gdf2_reset

Unnamed: 0,Major,Graduating Year,Student School
0,Biology,2015,1
1,CS,2012,2
2,CS,2017,1
3,CS,2018,1
4,EE,2011,1
5,Economics,2012,1
6,Economics,2015,2
7,Economics,2019,1
8,Math,2018,1
9,Philosophy,2013,1


In [108]:
gdf2_reset[(gdf2_reset['Major'] == 'Economics') & (gdf2_reset['Graduating Year'] == 2015)]

Unnamed: 0,Major,Graduating Year,Student School
6,Economics,2015,2


In [106]:
ser = gdf2_reset[(gdf2_reset['Major'] == 'Economics') & (gdf2_reset['Graduating Year'] == 2015)]['Student School']
ser

6    2
Name: Student School, dtype: int64

In [109]:
ser.values[0]

2

##### <font color="brown">Example 2c: Value Counts</font>

**Q. What are the top majors by number of graduates?**

In [110]:
grads

Unnamed: 0,Student School,Graduating Year,Major
0,Rutgers,2012,CS
1,Penn State,2011,EE
2,Princeton,2013,Psychology
3,MIT,2010,Physics
4,Rutgers,2018,Math
5,Penn State,2019,Economics
6,MIT,2017,CS
7,Penn State,2015,Biology
8,Rutgers,2013,Philosophy
9,Princeton,2012,Economics


In [111]:
major_counts = grads['Major'].value_counts()
major_counts

CS            4
Economics     4
Psychology    2
EE            1
Physics       1
Math          1
Biology       1
Philosophy    1
Name: Major, dtype: int64

In [112]:
major_counts[major_counts == major_counts.max()]

CS           4
Economics    4
Name: Major, dtype: int64

In [113]:
major_counts[major_counts == major_counts.max()].index.tolist()

['CS', 'Economics']

**Note above that an index can be converted to a list with the tolist method**

---

---

#### <font color="brown">Dropping rows or columns (variation of del operation for column)</font>

In [114]:
nparr = np.random.random((4,3))
randdf = DataFrame(nparr,index=['four','one','three','two'],columns=['first','second','third'])
randdf

Unnamed: 0,first,second,third
four,0.388045,0.241444,0.059673
one,0.818102,0.497951,0.232546
three,0.881332,0.796727,0.19383
two,0.843796,0.860696,0.862067


**Dropping rows**

In [117]:
randdf2 = randdf.drop(['four','three'])
randdf2

Unnamed: 0,first,second,third
one,0.818102,0.497951,0.232546
two,0.843796,0.860696,0.862067


In [118]:
randdf  # is original changed? NO

Unnamed: 0,first,second,third
four,0.388045,0.241444,0.059673
one,0.818102,0.497951,0.232546
three,0.881332,0.796727,0.19383
two,0.843796,0.860696,0.862067


In [119]:
rfcopy = randdf.copy()
del rfcopy['second']
rfcopy  # is origial changed?

Unnamed: 0,first,third
four,0.388045,0.059673
one,0.818102,0.232546
three,0.881332,0.19383
two,0.843796,0.862067


In [120]:
# trying to drop column "third" 
# Only row can drop
randdf.drop(['third'])

KeyError: "['third'] not found in axis"

In [121]:
randdf.drop(['third'],axis=1)

Unnamed: 0,first,second
four,0.388045,0.241444
one,0.818102,0.497951
three,0.881332,0.796727
two,0.843796,0.860696


In [122]:
randdf

Unnamed: 0,first,second,third
four,0.388045,0.241444,0.059673
one,0.818102,0.497951,0.232546
three,0.881332,0.796727,0.19383
two,0.843796,0.860696,0.862067


In [131]:
rcopy = randdf.copy()
rcopy.drop(['three','four'],inplace=True)
rcopy.drop(['third'],axis=1)
rcopy

Unnamed: 0,first,second,third
one,0.818102,0.497951,0.232546
two,0.843796,0.860696,0.862067


In [21]:
import re
w =input()
w = w.replace(' ', '')
r = "".join(reversed(w))
start, end = re.match(w,r).span()
if(end == len(w)):
    print('It is palindrome')

elif(end != len(w)):
    print('It is not palindrome')

AttributeError: 'NoneType' object has no attribute 'span'

In [37]:
import re
w =input()
w = w.replace('', '')
r = "".join(reversed(w))
start, end = re.match(w,r).span()
if(end == len(w)):
    print('It is palindrome')
else:
   print('It is not palindrome')


AttributeError: 'NoneType' object has no attribute 'span'

In [15]:
import re

f = input()
a="(\d\.+)*^[1-9]\d+[1-9]+(E|e)$"
if re.match(a,f):
        print('True')
else:
        print('False')


Flase


In [25]:
import numpy as np
nparr = nparr = np.array([9,6,2,5,7,3,8])

nparr[nparr%2==1] = nparr[nparr%2==1]*2

print(nparr)



[0 6 2 0 0 0 8]


In [29]:
import re

RE = '^[0-9][a-zA-z0-9]*$'
w = input()
result = bool(re.match(RE,w))

print(result)

False


In [38]:
small = sorted(nparr[0,:])[:2]
large = sorted(nparr[-1,:])[:2]

smallest = np.where(nparr[0,:] == small[0])
largest = np.where(nparr[-1,:] == large[0])
nparr[0,smallest],nparr[-1,largest] = nparr[-1,largest],nparr[0,smallest]

smallest2 = np.where(nparr[0,:] == small[1])
largest2 = np.where(nparr[-1,:] == large[1])
nparr[0,smallest2],nparr[-1,largest2] = nparr[-1,largest2],nparr[0,smallest2]

IndexError: too many indices for array: array is 1-dimensional, but 2 were indexed

In [48]:
import numpy as np
 
allscore = np.array([54,36,63,87])
newarr = []

res = int(allscore.mean())
newarr = allscore > res

print(res)
print(newarr)


60
[False False  True  True]


In [None]:
resultArr = score[:,np.sum(score,axis=0) > np.mean(np.sum(score,axis=0))]

In [None]:
resultArr = score[:,np.sum(score,axis=0) > np.mean(np.sum(score,axis=0))]

In [49]:
r'"(.*?)", (\d\d\d\d), \$(\d+\.\d\d)'

'"(.*?)", (\\d\\d\\d\\d), \\$(\\d+\\.\\d\\d)'

In [None]:
res = scores[:,np.sum(scores,axis=0)&gt;np.mean(np.sum(scores,axis=0))]

res = score[:,np.sum(score,axis=0) > np.mean(np.sum(score,axis=0))]