In [10]:
import pandas as pd
import numpy as np

# Parse the raw text file into (state, town) pairs.
# Header lines carry an '[edit]' marker and name a state; every other line is
# a town that belongs to the most recently seen state header.
university_towns = []
with open('DataSource/university_towns.txt') as towns_file:
    for line in towns_file:
        if '[edit]' in line:
            # New state header: remember it for the town lines that follow.
            state = line
            continue
        # Town line: pair it with the last-seen state (raw text, untrimmed).
        university_towns.append((state, line))

# Preview the first few pairs.
university_towns[:5]


[('Alabama[edit]\n', 'Auburn (Auburn University)[1]\n'),
 ('Alabama[edit]\n', 'Florence (University of North Alabama)\n'),
 ('Alabama[edit]\n', 'Jacksonville (Jacksonville State University)[2]\n'),
 ('Alabama[edit]\n', 'Livingston (University of West Alabama)[2]\n'),
 ('Alabama[edit]\n', 'Montevallo (University of Montevallo)[2]\n')]

In [12]:
# Turn the (state, town) tuples into a two-column DataFrame and preview it.
towns_df = pd.DataFrame(
    data=university_towns,
    columns=['State', 'RegionName'],
)
towns_df.head()

Unnamed: 0,State,RegionName
0,Alabama[edit]\n,Auburn (Auburn University)[1]\n
1,Alabama[edit]\n,Florence (University of North Alabama)\n
2,Alabama[edit]\n,Jacksonville (Jacksonville State University)[2]\n
3,Alabama[edit]\n,Livingston (University of West Alabama)[2]\n
4,Alabama[edit]\n,Montevallo (University of Montevallo)[2]\n


In [14]:
# GOAL: We only need the state name and the town name, so everything else can be removed.
# Let's use applymap() to apply a Python callable to each element of the DataFrame.

def get_citystate(item):
    """Return the bare state or town name from one raw cell of the towns table.

    Raw cells look like ``'Auburn (Auburn University)[1]\\n'`` or
    ``'Alabama[edit]\\n'``. Everything from the first ``' ('`` or ``'['``
    (whichever appears earlier) is dropped, and surrounding whitespace,
    including the trailing newline, is stripped.

    Parameters
    ----------
    item : object
        A single cell value. Strings are cleaned; any other value
        (e.g. NaN or None) is returned unchanged.

    Returns
    -------
    object
        The cleaned name for string input, otherwise ``item`` itself.
    """
    # Leave non-string cells alone instead of failing on the `in`/find calls.
    if not isinstance(item, str):
        return item

    # Cut at the earliest annotation marker so that a footnote placed before
    # the parenthesised university (e.g. 'Town[3] (Univ)') is removed as well.
    cut_points = [pos for pos in (item.find(' ('), item.find('[')) if pos != -1]
    if cut_points:
        item = item[:min(cut_points)]

    # Lines without any marker would otherwise keep their trailing newline.
    return item.strip()
    
# Apply get_citystate to every cell of the frame.
# DataFrame.applymap() was renamed to DataFrame.map() in pandas 2.1 and the
# old name is deprecated, so use the new name whenever this pandas has it.
if hasattr(pd.DataFrame, 'map'):
    towns_df = towns_df.map(get_citystate)
else:
    towns_df = towns_df.applymap(get_citystate)
towns_df.head()

# The element-wise mapper (applymap(), called map() since pandas 2.1) took
# each element from the DataFrame, passed it to the function, and the
# original value was replaced by the returned value. It’s that simple!

Unnamed: 0,State,RegionName
0,Alabama,Auburn
1,Alabama,Florence
2,Alabama,Jacksonville
3,Alabama,Livingston
4,Alabama,Montevallo


In [15]:
# Technical Details:
# While it is a convenient and versatile method,
# .applymap() can have significant runtime for larger datasets,
# because it calls a Python function once for each individual element.
# In some cases, it can be more efficient to use vectorized operations
# that rely on Cython or NumPy (which, in turn, make calls in C) under the hood.