# DataWorkshop - course NLP2 (2021)
#### https://dataworkshop.eu/pl/nlp
## NLP contest: 
#### https://www.kaggle.com/c/nlp-predicting-house-prices/leaderboard
### author: Jarosław Maksimowicz

In [1]:
import pandas as pd
import numpy as np
import re
np.random.seed(0)
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import KFold, cross_val_score
import xgboost as xgb

pd.set_option('display.max_colwidth',None) ### show all text

!mkdir -p ../output

### load data
df_train = pd.read_hdf("../input/df.train.h5")
df_test = pd.read_hdf("../input/df.test.h5")

#print("loaded df_train & df_test: ", df_train.shape, df_test.shape)

df_train['price_value'] = df_train['price'].map(lambda x: x.split("zł")[0].replace(" ", "").replace(",", ".")).astype(float)

df = pd.concat([df_train, df_test], axis=0)
df["text"] = df["text_description"].map(lambda x: " ".join(x))

### manual clean
df["text"] = df["text"].map(lambda x: x.replace("\r\n", "").replace("\n", "").replace("\xa0", "").replace("  ", " "))

# stop words
my_stop_words = {'i', 'Opis'}

#### prepare X, y
vec = CountVectorizer(**{'max_features': 3000, 'lowercase': True})
X = vec.fit_transform(df['text']).toarray()
y = df['price_value']

In [None]:
### Featue Engineering

In [127]:
def find_keywords(df, keyword, top=50, len_before=10, len_after=30, return_all=False):
    sub_df = df[ df["text"].map(lambda x: keyword in x) ]["text"]
#    print(sub_df.shape)
    def find_substr(value):
        value = value.lower()
        keyword_idx = value.find(keyword)
        return value[keyword_idx-len_before : keyword_idx+len_after]
        
    if return_all:
        return df["text"].map(find_substr)
    
    return df["text"].map(find_substr).sample(top)


def parse_founded(sample):
    if sample != "":
        return 1 
    return 0 

In [128]:
### GARAŻ / PARKING / POSTOJOWE
df["garage"] = find_keywords(df, "garaż", top=50, len_before=0, len_after=10, return_all=True).map(parse_founded)
df["parking"] = find_keywords(df, "parking", top=50, len_before=0, len_after=10, return_all=True).map(parse_founded)
df["parking_place"] = find_keywords(df, "postojowe", top=50, len_before=0, len_after=10, return_all=True).map(parse_founded)

def get_can_park(a,b,c):
    if a or b or c:
        return 1
    return 0
df["can_park"] = df.apply(lambda x: get_can_park(x["garage"],x["parking"],x["parking_place"]),axis=1)

X_parking = df[ ["garage", "parking", "parking_place", "can_park"] ].fillna(-1)
#X_parking.shape
#X_parking.sample(10)
X_final = np.hstack([X_parking, X])

In [129]:
### AREA
#pattern = r"powierzchn[^\d]\s*[a-zżźńęóął:\.\-\,\s\(]*(\d+[.,]?\d*)\s*(m2|mkw|m.kw.|m²)?"
pattern = r"powierzchn[^\d]+([\d]+[,.]?[\d]*)\s*(m2|mkw|m.kw.)?\s*"

def try_parse_area(sample):
    match = re.match( pattern, sample, re.M|re.I)
    if match: 
        return match.group(1)
    return sample

def parse_area(sample):
    match = re.match( pattern, sample, re.M|re.I)
    if match: 
        return match.group(1)
    return -1

df["area"] = find_keywords(df, "powierzch", top=50, len_before=0, len_after=30, return_all=True).map(parse_area)
df["area"] = df["area"].str.replace(",", ".").astype("float")
df["area_log"] = np.log(df["area"])
X_areas = df[ ["area", "area_log"] ].fillna(-1)
X_final = np.hstack([X_areas, X_final])
#X_areas.shape # (29580, 2)

In [130]:
### DO REMONTU / PO REMONCIE / STAN DEWELOPERSKI / MIESZKANIE / APARTAMENT / KAMIENICA / BALKON / TARAS 
### WIND / OKNA PCV / OKNA DREWNIANE / PIWNICA / KOMÓRK / STRYCH

### FOR RENOVATION / AFTER RENOVATION / DEVELOPMENT / FLAT / APARTMENT / TENEMENT/ BALCONY / TERRACE
### LIFT / PVC WINDOWS / WOODEN WINDOWS / BASEMENT / CELL / SLOPE 
df["for_renovation"] = find_keywords(df, "do remontu", top=50, len_before=0, len_after=20, return_all=True).map(parse_founded)
df["after_renovation"] = find_keywords(df, "po remoncie", top=50, len_before=0, len_after=20, return_all=True).map(parse_founded)
df["developers_status"] = find_keywords(df, "stan deweloperski", top=50, len_before=0, len_after=20, return_all=True).map(parse_founded)
df["flat"] = find_keywords(df, "mieszkanie", top=50, len_before=0, len_after=20, return_all=True).map(parse_founded)
df["apartment"] = find_keywords(df, "apartament", top=50, len_before=0, len_after=20, return_all=True).map(parse_founded)
df["tenement"] = find_keywords(df, "kamienica", top=50, len_before=0, len_after=20, return_all=True).map(parse_founded)
df["balcony"] = find_keywords(df, "balkon", top=50, len_before=0, len_after=20, return_all=True).map(parse_founded)
df["tarrace"] = find_keywords(df, "taras", top=50, len_before=0, len_after=20, return_all=True).map(parse_founded)
df["lift"] = find_keywords(df, "wind", top=50, len_before=0, len_after=20, return_all=True).map(parse_founded)
df["windows_pvc"] = find_keywords(df, "okna PVC", top=50, len_before=0, len_after=20, return_all=True).map(parse_founded)
df["windows_wooden"] = find_keywords(df, "okna drewniane", top=50, len_before=0, len_after=20, return_all=True).map(parse_founded)
df["basement"] = find_keywords(df, "piwnica", top=50, len_before=0, len_after=20, return_all=True).map(parse_founded)
df["cell"] = find_keywords(df, "komórk", top=50, len_before=0, len_after=20, return_all=True).map(parse_founded)
df["slope"] = find_keywords(df, "strych", top=50, len_before=0, len_after=20, return_all=True).map(parse_founded)
df["garden"] = find_keywords(df, "ogród", top=50, len_before=0, len_after=20, return_all=True).map(parse_founded)
df["air_conditioning"] = find_keywords(df, "klima", top=50, len_before=0, len_after=20, return_all=True).map(parse_founded)

X_mono = df[ ["for_renovation", "after_renovation", "developers_status", "flat", "apartment",
             "tenement", "balcony", "tarrace", "lift", "windows_pvc", "windows_wooden", "basement", "cell", "slope", "garden", "air_conditioning"] ].fillna(-1)
#X_mono.shape
#X_mono.sample(10)
X_final = np.hstack([X_mono, X_final])

In [131]:
### NUMBER OF ROOMS
rooms_dic = {
            "1 pok": 1,
            "1-pok": 1,
            "1- pok": 1,
            "jednopokojowe": 1,
            "jeden pokój": 1,
            "jednego poko": 1,
            "2 pok": 2,
            "2-pok": 2,
            "2- pok": 2,
            "dwa pok": 2,
            "dwupoko": 2,
            "dwóch poko": 2,
            "dwóch nieprzechodnich poko": 2,
            "dwóch przechodnich poko": 2,
            "dwa niezależne poko": 2,
            "3 pok": 3,
            "3-pok": 3,
            "3- pok": 3,
            "trzy poko": 3,
            "trzypoko": 3,
            "trzech poko": 3,
            "trzech nieprzechodnich poko": 3,
            "trzech przechodnich poko": 3,
            "trzy niezależne poko": 3,
            "4 pok": 4,
            "4-pok": 4,
            "4- pok": 4,
            "cztery poko": 4,
            "czteropoko": 4,
            "czterech poko": 4,
            "czterech nieprzechodnich poko": 4,
            "czterech przechodnich poko": 4,
            "cztery niezależne poko": 4,
            "5 pok": 5,
            "5-pok": 5,
            "5- pok": 5,
            "pięć poko": 5,
            "pięciopoko": 5,
            "pięciu poko": 5,
            "pięciu nieprzechodnich poko": 5,
            "pięciu przechodnich poko": 5,
            "pięć niezależnych poko": 5,
            "6 pok": 6,
            "6-pok": 6,
            "6- pok": 6,
            "sześć poko": 6,
            "sześciu poko": 6,
            "sześciopoko": 6,
            "sześć niezależnych poko": 6
            }

def find_rooms(value):
        
    value = value.lower()
        
    for keyword in rooms_dic: 
        if (value.find(keyword) >= 0):
            return int(rooms_dic[keyword])
    
    return -1
 
df["rooms"] = df.apply(lambda x: find_rooms(x["text"]),axis=1)

X_rooms = df[ ["rooms"] ].fillna(-1)
X_final = np.hstack([X_rooms, X_final])

#for k in rooms_dic:
#    print(k+" ==> "+str(rooms_dic[k]))

In [132]:
### PRICE_IN_TEXT
pattern_price = r"cena[^\d]+([\d]+[,.\s]?[\d]*)\s*(\s|zł|pln)"

def try_parse_price(sample):
    match = re.match( pattern_price, sample, re.M|re.I)
    if match: 
        return match.group(1)
    return sample

def parse_price(sample):
    match = re.match( pattern_price, sample, re.M|re.I)
    if match: 
        return match.group(1)
    return -1


#find_keywords(df, "cena", top=50, len_before=0, len_after=50, return_all=True).map(parse_price).str.replace(",", "").str.replace(".", "").str.replace(" ", "").astype("float")

df["price_in_text"] = find_keywords(df, "cena", top=50, len_before=0, len_after=50, return_all=True).map(parse_price)
df["price_in_text"] = df["price_in_text"].str.replace(",", "").str.replace(".", "").str.replace(" ", "")
df["price_in_text"] = df["price_in_text"].astype("float")
df["price_in_text"] = df["price_in_text"].apply(lambda x: x*1000 if x<1000 else x) # 220 tys. support
df["price_in_text_log"] = np.log(df["price_in_text"])
X_price = df[ ["price_in_text", "price_in_text_log"] ].fillna(-1)
X_final = np.hstack([X_price, X_final])
#df.loc[[13210]]

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [133]:
### RENT
pattern_rent = r"czynsz[^\d]+([\d]+[,.\s]?[\d]*)\s*(\s|zł|pln)"

def try_parse_rent(sample):
    match = re.match( pattern_rent, sample, re.M|re.I)
    if match: 
        return match.group(1)
    return sample

def parse_rent(sample):
    match = re.match( pattern_rent, sample, re.M|re.I)
    if match: 
        return match.group(1)
    return -1


df["rent"] = find_keywords(df, "czynsz", top=50, len_before=0, len_after=50, return_all=True).map(parse_rent)
df["rent"] = df["rent"].str.replace(",", "").str.replace(".", "").str.replace(" ", "")
df["rent"] = df["rent"].astype("float")
df["rent_log"] = np.log(df["rent"])
X_rent = df[ ["rent", "rent_log"] ].fillna(-1)
X_final = np.hstack([X_rent, X_final])
#df.loc[[13210]]

In [104]:
### save data to CSV
pd.set_option('display.max_colwidth',None)
df[["text"]].head(10)
df[ ['text'] ].to_csv('../output/data_text.csv', index=False)

In [135]:
X_final.shape

(29580, 3027)

In [136]:
X_train, y_train = X_final[ y.notnull()], y[ y.notnull()]
X_test, y_test = X_final[ y.isnull()], y[ y.isnull()]

In [137]:
X_final

array([[ 3.00000000e+02,  5.70378247e+00, -1.00000000e+00, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 2.00000000e+02,  5.29831737e+00, -1.00000000e+00, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [-1.00000000e+00, -1.00000000e+00, -1.00000000e+00, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       ...,
       [-1.00000000e+00, -1.00000000e+00,  2.95000000e+05, ...,
         0.00000000e+00,  1.00000000e+00,  0.00000000e+00],
       [-1.00000000e+00, -1.00000000e+00, -1.00000000e+00, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [-1.00000000e+00, -1.00000000e+00, -1.00000000e+00, ...,
         0.00000000e+00,  1.00000000e+00,  0.00000000e+00]])

In [138]:
# local score
model = xgb.XGBRegressor(n_estimators=1000, max_depth=5, random_state=0)
scores = cross_val_score(model, X_train, y_train, cv=3, scoring="neg_mean_absolute_error")
print("local score: ", np.around(np.mean(scores), 2), np.around(np.std(scores), 2))

# xgb (ne=1000,md=5) local score: -44138.71 # CountVectorizer(mf:3000}) 15m 50s F: area + parking + ...mono... + rooms + price_in_text
# na Kaglu: 42746.08

local score:  -44138.71 161.24


In [139]:
## train for predict
model.fit(X_train, y_train)
df_test['price_value'] = model.predict(X_test)
df_test[ ['id', 'price_value'] ].to_csv('../output/starter_1_jm_v3.csv', index=False)
# time 10m