In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
%config IPCompleter.greedy=True

In [98]:
# Read the combined listings CSV into `df`.
path = 'tom_slee/'
filename = 'all_tom_slee_listings.csv'
df = pd.read_csv(
    filepath_or_buffer=path + filename,
    encoding='ISO-8859-1',  # decode the file as Latin-1
    low_memory=False,       # read the whole file at once instead of in chunks
)

In [101]:
# When the data has a 'last_review' column, keep only the rows whose value
# compares greater than the string '0' and sort the frame by that column.
# The filter reads from `df` itself: the name `listings` used previously is
# not defined anywhere in the visible notebook, so the cell raised a
# NameError on a fresh kernel whenever the column was present.
if 'last_review' in df:
    df = (
        df[df['last_review'] > '0']
        .sort_values(by='last_review', axis=0)
    )

In [4]:
def find_m2_in_txt(lines):
    """Return the items of `lines` that contain the substring 'm2'.

    The order of `lines` is preserved and a new list is always returned.
    """
    needles = ['m2']
    hits = []
    for fragment in lines:
        for needle in needles:
            if needle in fragment:
                hits.append(fragment)
                break
    return hits

In [5]:
# extract the number of m2 from a list of strings that mention 'm2'
def m2_amount(ls):
    """Return the first m2 amount found in `ls` as a string of digits.

    Each string is checked for digits glued to 'm2' (e.g. '50m2'); if the
    'm2' token carries no digits, digits followed by one space and 'm2'
    (e.g. '50 m2') are tried instead. The scan stops at the first string
    that yields digits.

    Parameters:
        ls: iterable of strings to scan, in order.

    Returns:
        The captured digits, or '' when no string yields any.
    """
    for line in ls:
        attached = re.search(r'\b(\d*)?m2\b', line)
        if attached is None:
            continue
        # group(1) is '' for a bare 'm2'; `or ''` also covers a None group
        digits = attached.group(1) or ''
        if not digits:
            spaced = re.search(r'(\d*) (\bm2\b)', line)
            digits = spaced.group(1) if spaced is not None else ''
        if digits:
            return digits
        # No digits in this string: keep scanning. Previously a digit-less
        # ' m2' mention ended the loop and hid amounts in later strings.
    return ''

In [6]:
# For one column of a dataframe: select the values that mention 'm2',
# then extract the associated amount of m2 from each of them.
def find_m2_for_column(frame, colname):
    """Extract an m2 amount for every row of `frame[colname]` mentioning 'm2'.

    Parameters:
        frame: dataframe holding the text column.
        colname: label of the column to scan.

    Returns:
        A Series holding one extracted amount (or '') per matching row,
        indexed like those rows; rows without 'm2' are not included.
    """
    column = frame[colname]
    mentions_m2 = column.str.contains(r'm2', na=False)
    # break every matching value into its '.'-separated pieces
    pieces = column[mentions_m2].str.split('.')

    def _amount_or_blank(candidates):
        if type(candidates) is list:
            return m2_amount(candidates)
        return ''

    result = pieces.apply(find_m2_in_txt).apply(_amount_or_blank)

    # blank out any null entries
    result.loc[result.isnull()] = ''

    return result

In [7]:
def strip_nan(frame):
    """Return `frame` with every element that is not exactly a `str` replaced by ''."""
    def _text_or_blank(value):
        if type(value) is str:
            return value
        return ''

    return frame.apply(_text_or_blank)

In [8]:
def extract_m2_to_column(df, col_target='', col_output='', stripNaN=True):
    """Store the m2 amounts found in `df[col_target]` in `df[col_output]`.

    `df` is modified in place. With `stripNaN` true, every entry of the new
    column that is not a string (e.g. rows that had no 'm2' mention) is
    replaced by ''.

    Returns:
        The resulting `df[col_output]` column.
    """
    df[col_output] = find_m2_for_column(df, col_target)
    if not stripNaN:
        return df[col_output]
    df[col_output] = strip_nan(df[col_output])
    return df[col_output]

In [103]:
# Inspect the column labels available in df.
df.columns

Index(['Unnamed: 0', 'accommodates', 'bathrooms', 'bedrooms', 'borough',
       'city', 'country', 'host_id', 'last_modified', 'latitude', 'location',
       'longitude', 'minstay', 'name', 'neighborhood', 'overall_satisfaction',
       'price', 'reviews', 'room_id', 'room_type', 'survey_id'],
      dtype='object')

In [104]:
# Columns of df whose text is scanned for 'm2' mentions.
targets = ['name']

In [105]:
def extract_m2_target_columns(df, target_columns):
    """Run the m2 extraction for each column in `target_columns`.

    For every target column `tc` a column named 'm2_' + tc is written to
    `df` (in place).

    Returns:
        The list of output column names, in the order of `target_columns`.
    """
    created = []
    for source_col in target_columns:
        created.append('m2_' + source_col)
        extract_m2_to_column(df, col_target=source_col, col_output=created[-1])
    return created

In [106]:
# Extract m2 amounts for every target column and remember the names of the
# helper columns that were added to df.
temporary_columns = extract_m2_target_columns(df, target_columns=targets)
temporary_columns

['m2_name']

In [108]:
def combine_m2_columns(df, candidates=()):
    """Create or update `df['m2']` from the best-populated candidate column.

    The first candidate seeds `df['m2']` when that column does not exist
    yet. A later candidate replaces it only when it holds more populated
    entries (neither null nor ''). The comparison previously used `len()`
    of the two columns, which is always equal within one dataframe, so no
    candidate could ever replace an existing 'm2' column.

    Parameters:
        df: dataframe to update in place.
        candidates: iterable of column labels to consider, in order.

    Returns:
        The same `df` object.
    """
    def _populated(series):
        # number of entries that are neither null nor the empty string
        return int((series.notnull() & (series != '')).sum())

    for c in candidates:
        if 'm2' not in df or _populated(df[c]) > _populated(df['m2']):
            df['m2'] = df[c]
    return df

In [109]:
def replace_empty_values(df, target_col, cols):
    """Fill '' entries of `df[target_col]` from the columns in `cols`.

    The columns are tried in order; an entry is filled by the first column
    that has a non-empty, non-null value in that row. `df` is modified in
    place and nothing is returned. The `np.where` result was previously
    discarded, so the function had no effect.

    Parameters:
        df: dataframe to update in place.
        target_col: label of the column whose '' entries are filled.
        cols: iterable of column labels providing replacement values.
    """
    for c in cols:
        fill_here = (df[target_col] == '') & (df[c] != '') & df[c].notnull()
        df[target_col] = np.where(fill_here, df[c], df[target_col])

In [110]:
# Build the 'm2' column from the helper columns, then print how many
# listings ended up with a non-empty m2 value.
combined = combine_m2_columns(df, temporary_columns)
replace_empty_values(df, 'm2', temporary_columns)
unempty_m2 = df.loc[df['m2'] != '', 'm2']
print(
    'total:', df['host_id'].count(),
    ', with_m2:', unempty_m2.count(),
    ', percentage: ', unempty_m2.count() / df['host_id'].count() * 100,
)


total: 221067 , with_m2: 1726 , percentage:  0.780758774489


In [111]:
# Remove the temporary helper columns; the result is a new frame.
filtered_df = df.drop(temporary_columns, axis=1)

In [112]:
# Export the results!
# Note: with the to_csv defaults the row index is written as an extra first column.
filtered_df.to_csv(path_or_buf=path + 'all_listings_summary_with_m2.csv')