In [150]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import KFold, train_test_split
from importlib import reload

import preprocess as pre
import timeit

# Read in the CL data

In [104]:
# df = pd.read_csv('data/no_dupes_lda_fit5_18.csv')
# This has the latest preproc texts
df = pd.read_csv('data/no_dupes_lda_fit5_18.csv', index_col=0, dtype = {'GEOID10':object,'blockid':object, 'postid':object})

In [105]:
len(df)

14748

In [5]:
# Compare count and overlap
black = df.index[df['high_black'] == True].tolist()
white = df.index[df['high_white'] == True].tolist()
asian = df.index[df['high_asian'] == True].tolist()

In [6]:
# Should I remove these from analysis?
len(set(black).intersection(set(white)))

2196

In [7]:
overlap = sorted(list(set(black).intersection(set(white))))

In [8]:
df = df.drop(df.index[overlap])

In [9]:
len(df)

12552

In [10]:
len(set(black).intersection(set(asian)))

3168

In [11]:
len(set(white).intersection(set(asian)))

1218

In [12]:
len(black)

6385

In [13]:
len(white)

8144

In [14]:
len(asian)

6208

woah, asian & white neighbs have the lowest overlap

# 0. Are titles alone predictive?

In [15]:
X_train, X_test, y_train, y_test = train_test_split(df['listingTitle'], df['high_white'], random_state=0)

In [16]:
word_vectorizer = CountVectorizer()
word_vectorizer.fit(X_train)
X_train_vectorized = word_vectorizer.transform(X_train)
model = LogisticRegression(C=.1).fit(X_train_vectorized, y_train)
predictions = model.predict_proba(word_vectorizer.transform(X_test))[:,1]
binary_pred = [0 if value <= 0.5 else 1 for value in predictions]

print('AUC: ', roc_auc_score(y_test, predictions))
print('F1 score: ', f1_score(y_test, binary_pred))
print('accuracy: ', accuracy_score(y_test, binary_pred))

AUC:  0.7982795647259258
F1 score:  0.6679551604174719
accuracy:  0.7262587635436584


# Not really.

# Read in neighborhood names

In [2]:
with open('resources/hoods.txt', 'r') as inf:
    hoodnames = inf.read().splitlines()
    #hoodnames = inf.read()
    #hoodnames = re.split(r',\s*', hoodnames)

# Curated list of Seattle-area neighborhoods -- some manually added in.

In [3]:
neighbs = """"'Adams' 'Alki' 'Arbor Heights' 'Atlantic' 'Ballard' 'Belltown' 'Bellevue' Bitter Lake'
 'Bothell' 'Bremerton' 'Briarcliff' 'Brighton' 'Broadview' 'Broadway' 'Bryant' 'Capitol Hill' 'Cedar Park'
 'Central Business District' 'Columbia City' 'Crown Hill' 'Dunlap'
 'East Queen Anne' 'Eastlake' 'Everett' 'Fairmount Park' 'Fauntleroy' 'Federal Way' 'First Hill'
 'Fremont' 'Gatewood' 'Genesee' 'Georgetown' 'Green Lake' 'Greenlake' 'Greenwood'
 'Haller Lake' 'Harrison/Denny-Blaine' 'High Point' 'Highland Park'
 'Holly Park' 'Industrial District' 'Interbay' 'International District' 'Issaquah' 'Kirkland'
 'Laurelhurst' 'Lawton Park' 'Leschi' 'Lower Queen Anne' 'Loyal Heights'
 'Madison Park' 'Madrona' 'Mann' 'Maple Leaf' 'Matthews Beach'
 'Meadowbrook' 'Mid-Beacon Hill' 'Mill Creek' Minor' 'Montlake' 'Mount Baker' 'Newcastle'
 'North Admiral' 'North Beach/Blue Ridge' 'North Beacon Hill'
 'North College Park' 'North Delridge' 'North Queen Anne' 'Olympic Hills'
 'Phinney Ridge' 'Pike-Market' 'Pinehurst' 'Pioneer Square' 'Portage Bay'
 'Rainier Beach' 'Ravenna' 'Redmond' 'Renton' 'Riverview' 'Roosevelt' 'Roxhill' 'Seaview'
 'Seward Park' 'Shoreline' 'South Beacon Hill' 'South Delridge' 'South Lake Union'
 'South Park' 'Southeast Magnolia' 'Stevens' 'Sunset Hill'
 'University District' 'U District' 'UDistrict' 'Victory Heights' 'View Ridge' 'Wallingford'
 'Wedgwood' 'West Seattle' 'West Queen Anne' 'West Woodland' 'Westlake'
 'Whittier Heights' 'Windermere' 'Yesler Terrace'""".split("'")

In [4]:
hoods = [name.lower() for name in neighbs if re.match(r'\w+', name)]

In [5]:
hoods

['adams',
 'alki',
 'arbor heights',
 'atlantic',
 'ballard',
 'belltown',
 'bellevue',
 'bothell',
 'bremerton',
 'briarcliff',
 'brighton',
 'broadview',
 'broadway',
 'bryant',
 'capitol hill',
 'cedar park',
 'central business district',
 'columbia city',
 'crown hill',
 'dunlap',
 'east queen anne',
 'eastlake',
 'everett',
 'fairmount park',
 'fauntleroy',
 'federal way',
 'first hill',
 'fremont',
 'gatewood',
 'genesee',
 'georgetown',
 'green lake',
 'greenlake',
 'greenwood',
 'haller lake',
 'harrison/denny-blaine',
 'high point',
 'highland park',
 'holly park',
 'industrial district',
 'interbay',
 'international district',
 'issaquah',
 'kirkland',
 'laurelhurst',
 'lawton park',
 'leschi',
 'lower queen anne',
 'loyal heights',
 'madison park',
 'madrona',
 'mann',
 'maple leaf',
 'matthews beach',
 'meadowbrook',
 'mid-beacon hill',
 'mill creek',
 'montlake',
 'mount baker',
 'newcastle',
 'north admiral',
 'north beach/blue ridge',
 'north beacon hill',
 'north colleg

# Preprocess the data:
- Strip URLs (or.. should map them to '#url' ???)
- Map neighborhood names to '#hood'
- Tokenize words & punctuation


- don't use 'clean_text' yet since it has some preproc errors

In [6]:
#url_pattern = r'(https?:\/\/)?(www)?.*[\r\n]*'

punctuation_pattern = r"[#\w'-]+|[.,!?;]+"
url_pattern = r'(http)?(www)?\S*(\.com|\.net|\.gov|\.be|\.org)\S*'

In [7]:
punctuation_pattern = r"[#\w'-]+|[.,!?;]+"

In [8]:
def preprocess(text):
    text = text.lower()
    no_urls = re.sub(url_pattern, '', text)
    for hood in hoodnames:
        # hood_pattern = r'\s+{0}\s+'.format(hood)
        #hood_pattern = r' ?'+hood+' ?'
        # Match neighborhood mentions surrounded by whitespace and replace with #hood
        no_urls = re.sub(r'\W+{0}\W+'.format(hood), ' #hood ', no_urls)
    no_digits = re.sub(r'\d+', '', no_urls)
    tokenized = re.findall(punctuation_pattern, no_digits)
    return ' '.join([word.lower() for word in tokenized])

In [9]:
short_example = "this queen anne apartment is really cool !!! 98105 https://blah.com"

In [32]:
long_example = df.loc[100]['listingText']
long_example

'QR Code Link to This Post\r\nContact info:\r\nLindsay |\r\nshow contact info\r\nMINUTES WALK TO GREENLAKE! BEAUTIFULLY REMODELED & NOW AVAILABLE!\r\n6515 5th Ave NE #102, Seattle, WA 98115\r\n$1,350/mo\r\nKEY FEATURES\r\nSq Footage:\r\n700 sqft.\r\nBedrooms:\r\n1 Bed\r\nBathrooms:\r\n1 Bath\r\nParking:\r\n2 Off street\r\nLease Duration:\r\n6 Months (See Details Below)\r\nDeposit:\r\n$600\r\nPets Policy:\r\nNo Pets Allowed\r\nLaundry:\r\nShared\r\nProperty Type:\r\nApartment\r\nDESCRIPTION\r\nWelcome to Greenlake Park Apartments; a smaller 7 unit building located just minutes walk to Greenlake! This spacious one bedroom apartment has just had a complete remodel with hardwood flooring, stainless appliances, granite, lighting, the works! The apartment home is available for immediate occupancy!\r\nWe have off street parking and additional storage. Laundry facilities are conveniently located on-site. Enjoy a flexible 6-12 month lease term.\r\nEnjoy living in Greenlake where you can walk to

In [13]:
preprocess(short_example)

'this #hood apartment is really cool !!!'

In [17]:
pre.cl_clean_text(pd.Series(short_example))

0    this queen anne apartment is really cool !!!_ URL
dtype: object

In [25]:
preprocess(long_example)

'qr code link to this post property description enjoy this inviting #hood rambler just minutes from #hood square mall , lincoln square , and all the amenities #hood hood has to offer . this home features hardwood floors , vaulted ceilings , and plenty of natural light . a spacious partially finished daylight basement features an additional living room , craft kitchen , and walk out to a full fenced yard . located in the acclaimed #hood school #hood and just a short walk to the #hood public library , this is the ideal home for everyone . pets are considered on a case by case basis . please contact adrian villanueva at show contact info or show contact info for more information . details availability date february , pets cats , small dogs lb pet notes pets determined on a case by case basis deposit . application fee amenities laundry in unit parking type covered parking notes attached carport appliances dishwasher , refrigerator , microwave , range oven exterior yard-fenced , porch heati

In [21]:
pre.cl_clean_text(pd.Series(long_example)).values

array(['Erica Property Management show contact info show contact info N th St Apt Seattle WA Cozy bedroom bathroom washerdryer in unit storage on balcony lots of space !___ BR BA Apartment $___ month Bedrooms Bathrooms full partial Sq Footage Parking Garage $___ Street Parking Pet Policy No pets Deposit $___ Non Refundable Administrative Fee $___ Application Fee $___ per person over years of age DESCRIPTION Gorgeous story newly remodeled complex equipped with elevators and garage parking Conveniently located near shopping centers schools major bus lines and easy I access see additional photos below RENTAL FEATURES Living room OfficeDen Dishwasher Refrigerator StoveOven Washer Dryer Balcony Deck or Patio Cableready Highspeed internet COMMUNITY FEATURES Garage parking Storage spaces Secured entry Elevator New property years LEASE TERMS MONTHS Contact info Erica Property Management show contact info show contact info Posted Nov pm PST'],
      dtype=object)

In [28]:
texts = [[word for word in text.lower().split() if word not in hoodnames] for text in df.clean_text]

In [29]:
# Apply preproc to all texts
df['preproc_text'] = df['body_text'].apply(preprocess)

In [163]:
text = pd.Series('6th streeet is the bestest klsjdlfkj ?? ~@!! jurekj.sdlfjs')
text2 = df.listingText[df.postid== '6567473317.0'].values
test = df.iloc[200:1200].copy()
reload(pre)

<module 'preprocess' from '/Users/ikennedy/OneDrive - UW/UW/GIT/cl_lda/preprocess.py'>

In [164]:
with open('resources/seattle_stop_words.txt') as f:
    neighborhoods = f.read().splitlines()
from sklearn.feature_extraction import stop_words
stop_words = neighborhoods + list(stop_words.ENGLISH_STOP_WORDS)
stopword_pattern = re.compile(r'\b(' + r'|'.join(stop_words) + r')\b\s*', flags=re.IGNORECASE)
punctuation_pattern = r"[#\w']+|[!?]+"
url_pattern = r'(http)?(www)?\S*(\.com|\.net|\.gov|\.be|\.org)\S*'
short_pattern = r'^\b\w{1,3}\b | \b\w{1,3}\b'
text = (text.str.lower() # make lowercase
       .str.replace(stopword_pattern, '') # drop neighborhoods and other stopwords
       .str.replace(url_pattern, '') # drop urls
       .str.replace(r'\d+', '') # drop digits
       .str.findall(punctuation_pattern) # drop most punctuation
       .str.join(' ') # join after punctuation drop
       .str.replace(short_pattern, '')) # drop words with less than 3 characters
text.values

array(['streeet bestest klsjdlfkj ?? !! jurekj sdlfjs'], dtype=object)

In [153]:
%timeit pre.preprocess(test.listingText)

3.47 s ± 108 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [154]:
%timeit test['preproc_text'] = test['listingText'].apply(preprocess)

16.7 s ± 150 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [155]:
# Save her for faster loading!
df['preproc_text'] = pre.preprocess(df.listingText)
df.to_csv('5_25_preproc.csv')

# Train & test models.
## Split the data into train & test sets
## First, binary classif: high white vs not

In [31]:
X_train, X_test, y_train, y_test = train_test_split(df['preproc_text'], df['high_white'], random_state=0)

In [32]:
word_vectorizer = CountVectorizer()

In [33]:
X_train_vectorized = word_vectorizer.fit_transform(X_train)

In [34]:
# Logistic regression model
model = LogisticRegression(C=.1).fit(X_train_vectorized, y_train)

In [35]:
predictions = model.predict_proba(word_vectorizer.transform(X_test))[:,1]

In [36]:
binary_pred = [0 if value <= 0.5 else 1 for value in predictions] #just use model.predict for these..

In [37]:
print('AUC: ', roc_auc_score(y_test, predictions))
print('F1 score: ', f1_score(y_test, binary_pred))
print('accuracy: ', accuracy_score(y_test, binary_pred))

AUC:  0.8585746093380979
F1 score:  0.7699530516431925
accuracy:  0.781389420012747


In [38]:
fpr, tpr, thresholds = roc_curve(y_test, predictions)
roc_auc = auc(fpr, tpr)

In [39]:
plt.figure()
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % (roc_auc))
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")

ax2 = plt.gca().twinx()
ax2.plot(fpr, thresholds, markeredgecolor='r',linestyle='dashed', color='r')
ax2.set_ylabel('Threshold',color='r')
ax2.set_ylim([thresholds[-1],thresholds[0]])
ax2.set_xlim([fpr[0],fpr[-1]])

(0.0, 1.0)

In [40]:
feature_names = np.array(word_vectorizer.get_feature_names())

# Sort the coefficients from the model
sorted_coef_index = model.coef_[0].argsort()

# Find the 10 smallest and 10 largest coefficients
# The 10 largest coefficients are being indexed using [:-11:-1]
# so the list returned is in order of largest to smallest
print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:20]]))
print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[-20:]]))

Smallest Coefs:
['rail' 'westwood' 'concierge' 'marymoor' 'mall' 'airport' 'rianna' 'moda'
 'aurora' 'harbor' 'riverpark' 'somerset' 'pools' 'jefferson' 'cedar'
 'stadiums' 'fountain' 'cleveland' 'borgata' 'grand']

Largest Coefs: 
['country' 'zoo' 'boutiques' 'domaine' 'village' 'esxpt' 'children'
 'inglenook' 'bernard' 'woodland' 'volunteer' 'northshore' 'point' 'odin'
 'springline' 'waterscape' 'beach' 'locks' 'urbana' 'nw']


In [41]:
# Five-fold cross validation
kf = KFold(n_splits=5)
X, y = df['preproc_text'], df['high_white']
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    word_vectorizer.fit(X_train)
    X_train_vectorized = word_vectorizer.fit_transform(X_train)
    model = LogisticRegression(C=.1).fit(X_train_vectorized, y_train)
    predictions = model.predict_proba(word_vectorizer.transform(X_test))[:,1]
    binary_pred = [0 if value <= 0.5 else 1 for value in predictions]
    print('AUC: ', roc_auc_score(y_test, predictions))
    print('F1 score: ', f1_score(y_test, binary_pred))
    print('accuracy: ', accuracy_score(y_test, binary_pred))
    feature_names = np.array(word_vectorizer.get_feature_names())

    # Sort the coefficients from the model
    sorted_coef_index = model.coef_[0].argsort()

    # Find the 10 smallest and 10 largest coefficients
    # The 10 largest coefficients are being indexed using [:-11:-1]
    # so the list returned is in order of largest to smallest
    print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:20]]))
    print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[-20:]]))

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self.loc[key]


ValueError: np.nan is an invalid document, expected byte or unicode string.

# Using ngram features

In [42]:
# remake train test split
X_train, X_test, y_train, y_test = train_test_split(df['preproc_text'], df['high_white'], random_state=0)

In [43]:
ngram_vectorizer = CountVectorizer(ngram_range=(1,4)).fit(X_train)

In [44]:
X_train_ngrams = ngram_vectorizer.transform(X_train)
# Logistic regression model
#model = LogisticRegression(C=.5).fit(X_train_ngrams, y_train)
model = LogisticRegression(C=.5, penalty='l2').fit(X_train_ngrams, y_train)
predictions = model.predict_proba(ngram_vectorizer.transform(X_test))[:,1]

In [71]:
binary_pred = [0 if value <= 0.5 else 1 for value in predictions]

In [72]:
print('AUC: ', roc_auc_score(y_test, predictions))
print('F1 score: ', f1_score(y_test, binary_pred))
print('accuracy: ', accuracy_score(y_test, binary_pred))

AUC:  0.8898263399125023
F1 score:  0.798804780876494
accuracy:  0.8068833652007649


In [73]:
feature_names = np.array(ngram_vectorizer.get_feature_names())

# Sort the coefficients from the model
sorted_coef_index = model.coef_[0].argsort()
print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:20]]))
print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[-20:]]))

Smallest Coefs:
['mall' 'rail' 'airport' 'light rail' 'concierge' 'south'
 'hood university' 'gym' 'shopping' 'center' 'college' 'marymoor'
 'th ave hood' 'story' 'south hood' 'section' 'near' 'marymoor park'
 'to shopping' 'westwood']

Largest Coefs: 
['uw' 'ave nw' 'children' 'locks' 'hood beach' 'west' 'charming'
 'hood ave' 'basement' 'point' 'smoking' 'laundry' 'village' 'on hood'
 'deck' 'hood village' 'market' 'shops' 'beach' 'nw']


In [74]:
# CV with ngram on high white
kf = KFold(n_splits=5)
X, y = df['preproc_text'], df['high_white']
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    X_train_vectorized = ngram_vectorizer.fit_transform(X_train)
    model = LogisticRegression(C=.5).fit(X_train_vectorized, y_train)
    predictions = model.predict_proba(ngram_vectorizer.transform(X_test))[:,1]
    binary_pred = [0 if value <= 0.5 else 1 for value in predictions]
    print('AUC: ', roc_auc_score(y_test, predictions))
    print('F1 score: ', f1_score(y_test, binary_pred))
    print('accuracy: ', accuracy_score(y_test, binary_pred))
    feature_names = np.array(ngram_vectorizer.get_feature_names())

    # Sort the coefficients from the model
    sorted_coef_index = model.coef_[0].argsort()

    # Find the 10 smallest and 10 largest coefficients
    # The 10 largest coefficients are being indexed using [:-11:-1]
    # so the list returned is in order of largest to smallest
    print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:20]]))
    print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[-20:]]))

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self.loc[key]


ValueError: np.nan is an invalid document, expected byte or unicode string.

In [None]:
# AP note (04/14) The neighborhood names are still not complete enough -_-
# also they are still sneaking in somehow... need more work on preproc
# 'kirkland' and 'issaquah' are IN the damn hoods list.. why are they still showing up?!
# 5/22: updated regex in preproc, does much better now!

# OK, that was a binary prediction on high-white; let's see high-black

In [51]:
# Prob a faster way to just grab new labels...
X_train, X_test, y_train, y_test = train_test_split(df['preproc_text'], df['high_black'], random_state=0)

In [52]:
X_train_ngrams = ngram_vectorizer.transform(X_train)
# Logistic regression model
model = LogisticRegression(C=.5).fit(X_train_ngrams, y_train)
predictions = model.predict_proba(ngram_vectorizer.transform(X_test))[:,1]


#Parameters to try: l1 penalty instead of l2
# different regularization 

In [53]:
binary_pred = [0 if value <= 0.5 else 1 for value in predictions]

In [54]:
print('AUC: ', roc_auc_score(y_test, predictions))
print('F1 score: ', f1_score(y_test, binary_pred))
print('accuracy: ', accuracy_score(y_test, binary_pred))

AUC:  0.9070896497141245
F1 score:  0.7472647702407001
accuracy:  0.8527724665391969


In [55]:
feature_names = np.array(ngram_vectorizer.get_feature_names())

# Sort the coefficients from the model
sorted_coef_index = model.coef_[0].argsort()
print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:20]]))
print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[-20:]]))

Smallest Coefs:
['nw' 'ne' 'beach' 'uw' 'trails' 'trail' 'school hood' 'hood square'
 'newport' 'island' 'courtyard' 'hood trail' 'blocks' 'hood beach' 'old'
 'shops' 'hood park' 'car garage' 'ride' 'hardwood floors']

Largest Coefs: 
['gated' 'college' 'freeway' 'hood station' 'aurora' 'golf' 'ave hood'
 'north hood' 'rianna' 'station' 'th ave hood' 'mall' 'court' 'south'
 'concierge' 'light rail' 'south hood' 'airport' 'hood university' 'rail']


In [56]:
X_train_vectorized = word_vectorizer.transform(X_train)
model = LogisticRegression(C=.1, penalty='l1').fit(X_train_vectorized, y_train)
predictions = model.predict_proba(word_vectorizer.transform(X_test))[:,1]
binary_pred = [0 if value <= 0.5 else 1 for value in predictions]
print('AUC: ', roc_auc_score(y_test, predictions))
print('F1 score: ', f1_score(y_test, binary_pred))
print('accuracy: ', accuracy_score(y_test, binary_pred))

AUC:  0.8560934278030323
F1 score:  0.6311522872032427
accuracy:  0.7970044614404079


In [57]:
feature_names = np.array(word_vectorizer.get_feature_names())

# Sort the coefficients from the model
sorted_coef_index = model.coef_[0].argsort()
print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:20]]))
print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[-20:]]))

Smallest Coefs:
['nw' 'newport' 'ne' 'trader' 'winning' 'trails' 'gilman' 'volunteer'
 'beach' 'se' 'comprehensive' 'marymoor' 'trail' 'lincoln' 'surrey' 'coin'
 'springline' 'urbana' 'september' 'uw']

Largest Coefs: 
['mall' 'cityline' 'swedish' 'hills' 'stadiums' 'cedar' 'aurora' 'olympus'
 'helios' 'centennial' 'southcenter' 'sculpture' 'jefferson' 'concierge'
 'rail' 'airport' 'westwood' 'rianna' 'harbor' 'moda']


In [None]:
y_test.value_counts()


In [None]:
1627/2060

In [None]:
# Try SVM..

model = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42, max_iter=5, tol=None)
model.fit(X_train_vectorized,y_train)
predicted = model.predict(word_vectorizer.transform(X_test))
np.mean(predicted == y_test)

In [None]:
model.score(word_vectorizer.transform(X_test), y_test)

In [None]:
# Five-fold cross validation on unigrams
kf = KFold(n_splits=5)
X, y = df['preproc_text'], df['high_black']
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    word_vectorizer.fit(X_train)
    X_train_vectorized = word_vectorizer.fit_transform(X_train)
    model = LogisticRegression(C=.5).fit(X_train_vectorized, y_train)
    predictions = model.predict_proba(word_vectorizer.transform(X_test))[:,1]
    binary_pred = [0 if value <= 0.5 else 1 for value in predictions]
    print('AUC: ', roc_auc_score(y_test, predictions))
    print('F1 score: ', f1_score(y_test, binary_pred))
    print('accuracy: ', accuracy_score(y_test, binary_pred))
    feature_names = np.array(word_vectorizer.get_feature_names())

    # Sort the coefficients from the model
    sorted_coef_index = model.coef_[0].argsort()
    print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:20]]))
    print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[-20:]]))

In [46]:
# CV with ngram
kf = KFold(n_splits=5)
X, y = df['preproc_text'], df['high_black']
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    X_train_vectorized = ngram_vectorizer.fit_transform(X_train)
    model = LogisticRegression(C=.5).fit(X_train_vectorized, y_train)
    predictions = model.predict_proba(ngram_vectorizer.transform(X_test))[:,1]
    binary_pred = [0 if value <= 0.5 else 1 for value in predictions]
    print('AUC: ', roc_auc_score(y_test, predictions))
    print('F1 score: ', f1_score(y_test, binary_pred))
    print('accuracy: ', accuracy_score(y_test, binary_pred))
    feature_names = np.array(ngram_vectorizer.get_feature_names())

    # Sort the coefficients from the model
    sorted_coef_index = model.coef_[0].argsort()
    print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:20]]))
    print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[-20:]]))

AUC:  0.8570017743120733
F1 score:  0.7233669443226654
accuracy:  0.7861016949152543
Smallest Coefs:
['nw' 'ne' 'hood square' 'uw' 'trails' 'newport' 'se' 'school hood'
 'mountains' 'beach' 'shops' 'hood park' 'trail' 'ave nw' 'courtyard'
 'square' 'ne hood wa' 'tour today' 'zoo' 'excellent']

Largest Coefs: 
['flooring' 'elliott' 'mall' 'moda' 'olive' 'hood station' 'ave hood'
 'harbor' 'pointe' 'th ave hood wa' 'south hood' 'golf' 'station'
 'th ave hood' 'concierge' 'court' 'light rail' 'airport'
 'hood university' 'rail']
AUC:  0.860598941466387
F1 score:  0.7143507972665147
accuracy:  0.7874576271186441
Smallest Coefs:
['nw' 'ne' 'uw' 'hood square' 'trails' 'school hood' 'hood park' 'se'
 'shops' 'beach' 'island' 'newport' 'square' 'ave nw' 'mountains' 'nw th'
 'zoo' 'starting' 'ne hood' 'trail']

Largest Coefs: 
['hood airport' 'south' 'golf course' 'the hood hood' 'community college'
 'valley' 'ave hood' 'olive' 'concierge' 'court' 'south hood'
 'hood station' 'station' 'pointe'