In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity
import pickle
import re

# load data

In [None]:
# Read the first sheet of each crawled workbook.
# `encoding` is not an ExcelFile argument (xlsx files carry their own encoding) and
# raises TypeError on current pandas, so it is dropped; the context manager makes
# sure each workbook handle is closed after parsing.
with pd.ExcelFile('crawl_data/train_data.xlsx') as train_book:
    df_train = train_book.parse('Sheet1')
with pd.ExcelFile('crawl_data/test_data.xlsx') as test_book:
    df_test = test_book.parse('Sheet1')
df_train.head(2)

In [None]:
# preprocessing data
# Two-digit prefixes used by the mobile carriers in Vietnam (without the leading '0').
_MOBILE_PREFIXES = frozenset([
    '89', '90', '93', '70', '79', '77', '76', '78', '96', '97', '98', '86', '32',
    '33', '34', '35', '36', '37', '38', '39', '88', '91', '94', '81', '82', '83',
    '84', '85', '92', '56', '58', '99', '19', '52', '59', '87', '95', '71', '72', '74', '75',
])
_ASCII_DIGITS = frozenset('0123456789')


def check_mobile_number(phone_number):
    '''
    Check whether a string is a 9-digit mobile number with a known carrier prefix.

    input: (str) the mobile number, without the leading '0'
    output: (int) 1 if the number is accepted, 0 otherwise
    '''
    if len(phone_number) != 9:
        return 0
    # Reject anything that is not purely digits: such strings would otherwise
    # produce n-grams that do not exist in the phrase index.
    if not _ASCII_DIGITS.issuperset(phone_number):
        return 0
    return 1 if phone_number[:2] in _MOBILE_PREFIXES else 0
def _keep_valid_mobiles(df, raw_id_col, new_id_col):
    '''
    Keep the rows whose phone number passes check_mobile_number and renumber them.

    The raw id column is replaced by a fresh 0..n-1 id named `new_id_col`; the
    remaining columns are renamed positionally to price / phone_number / target.
    Columns left over from a previous run of this cell are dropped first, so the
    cell can be re-executed without a KeyError.
    '''
    is_mobile = df.phone_number.astype('str').apply(check_mobile_number) == 1
    stale_cols = [col for col in (raw_id_col, new_id_col, 'check') if col in df.columns]
    cleaned = (
        df[is_mobile]
        .drop(stale_cols, axis=1)
        .reset_index(drop=True)   # discard the pre-filter row labels
        .reset_index()            # materialise the new 0..n-1 id as the first column
    )
    cleaned.columns = [new_id_col, 'price', 'phone_number', 'target']
    return cleaned


# clean data
df_test = _keep_valid_mobiles(df_test, 'id_phone_test', 'id_test_p')
df_train = _keep_valid_mobiles(df_train, 'id_phone_train', 'id_train_p')
df_train.shape

In [None]:
# triggram
# triggram = ['0'+i for i in biggram] + [str(i) for i in list(np.arange(100,1000))]
# df_triggram = pd.DataFrame()
# df_triggram['num_phrases'] = triggram
# df_triggram = df_triggram.reset_index()
# df_triggram.columns = ['id', 'num_phrases']

In [None]:
# n-gram
def generate_ngrams(text, n_gram= 2):
    '''
    Build the character n-grams of `text` and return them joined by '|'.

    input: text (anything convertible with str()), n_gram (int) the window size
    output: (str) e.g. '0123' with n_gram=2 -> '01|12|23'; '' when text is
            shorter than n_gram. Whitespace characters are removed from the result.
    '''
    chars = str(text)
    # zip over the shifted copies of the string yields every window of n_gram characters
    windows = zip(*[chars[i:] for i in range(n_gram)])
    joined = "|".join("".join(window) for window in windows)
    # raw string: '\s' inside a normal literal is an invalid escape sequence
    return re.sub(r'\s+', '', joined)

In [None]:
# create sparse matrix
def create_sparse_mt(x_values, x_u, x_i, n_row, n_col):
    '''
    Assemble an (n_row, n_col) CSR matrix from coordinate triplets.

    x_values[k] is stored at row x_u[k], column x_i[k].
    '''
    coordinates = (x_u, x_i)
    matrix_shape = (n_row, n_col)
    return sparse.csr_matrix((x_values, coordinates), shape=matrix_shape)

# Create the index for num_phrases

In [None]:
# create index for the num_phrases
# "beautiful" 6-digit endings: one digit repeated, ascending runs, and the ABABAB shape
luc = [str(i)*6 for i in range(0,10)]
tang_dan_6 = generate_ngrams('0123456789', n_gram= 6).split('|')
lap_6 = ['ABABAB']

# "beautiful" 5-digit endings: one digit repeated, ascending runs, and the ABCAB shape
ngu = [str(i)*5 for i in range(0,10)]
tang_dan_5 = generate_ngrams('0123456789', n_gram= 5).split('|')
ganh_5 = ['ABCAB']

# "beautiful" 4-digit endings: one digit repeated, ascending runs, and the ABAB shape
tu = [str(i)*4 for i in range(0,10)]
tang_dan_4 = generate_ngrams('0123456789', n_gram= 4).split('|')
lap_4 = ['ABAB']

# "beautiful" 3-digit endings: one digit repeated, ascending runs, and the ABA shape
tam = [str(i)*3 for i in range(0,10)]
tang_dan_3 = generate_ngrams('0123456789', n_gram= 3).split('|')
ganh_3 = ['ABA']

# "beautiful" 2-digit endings.
# Each value is listed once: a repeated entry would give the same phrase two ids
# and make the later left merge on num_phrases duplicate feature rows.
nhi = ['06', '09', '10', '12', '16', '17', '18', '19', '26', '28', '33',
       '36', '37', '38', '39', '40', '46', '50', '52', '53', '56',
       '57', '66', '68', '77', '78', '79', '80', '83', '86',
       '88', '89', '90', '92', '97']
# the 'z' suffix keeps an ending pair distinct from the plain bigram with the same digits
nhi = [i + 'z' for i in nhi]

# carrier prefixes
dau_so = ['89', '90', '93', '70', '79', '77', '76', '78', '96', '97', '98', '86', '32',
          '33', '34', '35', '36', '37', '38', '39', '88', '91', '94', '81', '82', '83',
          '84', '85', '92', '56', '58', '99', '19', '52', '59', '87', '95', '71', '72', '74', '75']
# the 'a' suffix keeps a prefix distinct from the plain bigram with the same digits
dau_so = [i+'a' for i in dau_so]

# bigrams '00' .. '99'
biggram = ['%02d' % i for i in range(100)]

num_phrases = dau_so + biggram + nhi + luc + tang_dan_6 + lap_6 + ngu + tang_dan_5 + ganh_5 + tu + tang_dan_4 + lap_4 + tam + tang_dan_3 + ganh_3
# every phrase must map to exactly one column id
assert len(num_phrases) == len(set(num_phrases)), 'duplicate entries in num_phrases'

df_num = pd.DataFrame({'num_phrases': num_phrases})
# the positional row number becomes the phrase id
df_num_phrases = df_num.reset_index()
df_num_phrases.columns = ['id', 'num_phrases']

In [None]:
# writer = pd.ExcelWriter('index_num_phrases_ver3.xlsx', engine='xlsxwriter')
# df_num_phrases.to_excel(writer, 'Sheet1', index= False, encoding= 'utf-8')
# writer.save()

# Feature extraction

In [None]:
# extract feature
# Two-digit endings that get an extra '<pair>z' feature.
_LUCKY_PAIRS = frozenset([
    '06', '09', '10', '12', '16', '17', '18', '19', '26', '28', '33',
    '36', '37', '38', '39', '40', '46', '50', '52', '53', '56',
    '57', '66', '68', '77', '78', '79', '80', '83', '86',
    '88', '89', '90', '92', '97',
])


def _fixed_endings(length):
    '''Return the repeated-digit and ascending-run strings of the given length.'''
    digits = '0123456789'
    repeated = [d * length for d in digits]
    ascending = [digits[i:i + length] for i in range(len(digits) - length + 1)]
    return frozenset(repeated + ascending)


# Built once instead of on every call.
_FIXED_ENDINGS = dict((length, _fixed_endings(length)) for length in (6, 5, 4, 3))

# (length, phrase, shape test) checked after the fixed endings of the same length.
_SHAPE_RULES = (
    (6, 'ABABAB', lambda p: p[-6:-4] == p[-4:-2] == p[-2:]),
    (5, 'ABCAB', lambda p: p[-5:-3] == p[-2:]),
    (4, 'ABAB', lambda p: p[-4:-2] == p[-2:]),
    (3, 'ABA', lambda p: p[-3] == p[-1]),
)


def _special_ending(phone):
    '''Return the phrase of the first matching special ending (longest first), or None.'''
    for length, shape_name, has_shape in _SHAPE_RULES:
        ending = phone[-length:]
        # literal endings: 000000..999999 / 012345..456789 (and the shorter analogues)
        if ending in _FIXED_ENDINGS[length]:
            return ending
        if has_shape(phone):
            return shape_name
    return None


def extract_feature(idx, phone_number):
    '''
    Build the weighted phrase rows for one phone number.

    input: idx          id written into the 'id' column of every row
           phone_number (str) phone number; a single leading '0' is ignored
    output: DataFrame with columns ['id', 'num_phrases', 'count'], or the int 0
            when the number is rejected by check_mobile_number

    Rows produced:
      * bigrams of the 7 digits after the prefix, with their occurrence count
      * '<prefix>a' for the first two digits, weight 5
      * '<pair>z' when the last two digits are a lucky pair, weight 3
      * at most one special ending (longest match wins), weight 5
    '''
    # startswith() also copes with an empty string, where indexing [0] would raise
    phone = phone_number[1:] if phone_number.startswith('0') else phone_number
    if check_mobile_number(phone) == 0:
        return 0

    # bigram counts over the last 7 digits
    bigram_counts = {}
    for gram in generate_ngrams(phone[2:], n_gram= 2).split('|'):
        bigram_counts[gram] = bigram_counts.get(gram, 0) + 1
    # most frequent first, as value_counts() would order them
    rows = sorted(bigram_counts.items(), key=lambda item: -item[1])

    # first two digits (carrier prefix)
    rows.append((phone[:2] + 'a', 5))

    # last two digits, considered independently of the special endings
    if phone[-2:] in _LUCKY_PAIRS:
        rows.append((phone[-2:] + 'z', 3))

    special = _special_ending(phone)
    if special is not None:
        rows.append((special, 5))

    # Build the frame once from the collected rows; DataFrame.append was removed
    # in pandas 2.0 and copied the frame on every call before that.
    df = pd.DataFrame(rows, columns=['num_phrases', 'count'])
    df.insert(0, 'id', idx)
    return df

In [None]:
import time
# perf_counter() is a monotonic high-resolution clock meant for measuring short
# durations; time.time() is wall-clock and can jump.
t1 = time.perf_counter()
extract_feature(0, '0399946756')
t2 = time.perf_counter()
t2-t1

In [None]:
# Show the first row of the test frame.
df_test.head(1)

In [None]:
# One feature frame per training number, keyed by its id_train_p.
ls_data_train = [
    extract_feature(int(row_id), str(number))
    for row_id, number in zip(df_train['id_train_p'], df_train['phone_number'])
]

In [None]:
# One feature frame per test number, keyed by its id_test_p.
ls_data_test = [
    extract_feature(int(row_id), str(number))
    for row_id, number in zip(df_test['id_test_p'], df_test['phone_number'])
]

In [None]:
# Stack the per-number frames into one long frame per split.
df_feature_train, df_feature_test = (
    pd.concat(frames, axis=0) for frames in (ls_data_train, ls_data_test)
)

In [None]:
# Replace the repeated per-frame row labels left by concat with a fresh 0..n-1 index.
df_feature_train = df_feature_train.reset_index(drop=True)

In [None]:
# Replace the repeated per-frame row labels left by concat with a fresh 0..n-1 index.
# This must start from df_feature_test: resetting df_feature_train here would
# overwrite the test features with the training features.
df_feature_test = df_feature_test.reset_index(drop=True)

In [None]:
# The join key must be a string on all three frames.
df_num_phrases['num_phrases'] = df_num_phrases['num_phrases'].astype('str')
df_feature_test['num_phrases'] = df_feature_test['num_phrases'].astype('str')
df_feature_train['num_phrases'] = df_feature_train['num_phrases'].astype('str')
# Attach the phrase id to every feature row (the two 'id' columns become id_x / id_y).
df_feature_test = df_feature_test.merge(df_num_phrases, on='num_phrases', how='left')
df_feature_train = df_feature_train.merge(df_num_phrases, on='num_phrases', how='left')

In [None]:
# for i in range(df_feature_train.shape[0]):
#     print('index: ',i)
#     print(int(df_feature_train.loc[i, 'id_num']))

In [None]:
# id_x is the phone id and id_y the phrase id produced by the merge.
# The cast is chained onto the column selection so that no column is assigned on a
# slice of the merged frame (which triggers SettingWithCopyWarning).
df_feature_test = (
    df_feature_test[['id_x', 'id_y', 'count', 'num_phrases']]
    .astype({'id_x': 'int', 'id_y': 'int'})
)
df_feature_test.columns = ['id', 'id_num', 'count', 'num_phrases']

df_feature_train = (
    df_feature_train[['id_x', 'id_y', 'count', 'num_phrases']]
    .astype({'id_x': 'int', 'id_y': 'int'})
)
df_feature_train.columns = ['id', 'id_num', 'count', 'num_phrases']

In [None]:
# writer = pd.ExcelWriter('feature_train.xlsx', engine='xlsxwriter')
# df_feature_train[['id', 'id_num', 'count']].to_excel(writer, 'Sheet1')
# writer.save()

# writer1 = pd.ExcelWriter('feature_test.xlsx', engine='xlsxwriter')
# df_feature_test[['id', 'id_num', 'count']].to_excel(writer1, 'Sheet1')
# writer1.save()


In [None]:
# (row id, column id, value) triplets as plain arrays.
# DataFrame.as_matrix() was removed in pandas 1.0; .values is available on every version.
feature_train = df_feature_train[['id', 'id_num', 'count']].values
feature_test = df_feature_test[['id', 'id_num', 'count']].values

In [None]:
# Show the last two (id, id_num, count) triplets.
feature_train[-2:]

In [None]:
# Rows are addressed by phone id, so each matrix needs max(id) + 1 rows.
n_row_train = int(df_feature_train['id'].max()) + 1
n_row_test = int(df_feature_test['id'].max()) + 1
# The width comes from the full phrase index rather than from the ids seen in the
# training features: a phrase that occurs only in the test set would otherwise
# fall outside the matrix.
n_col = int(df_num_phrases['id'].max()) + 1
print(n_row_train)
print(n_row_test)
print(n_col)

In [None]:
# Phone-by-phrase matrices: value = count, row = phone id, column = phrase id.
X_train_sparse_csr = create_sparse_mt(
    feature_train[:, 2], feature_train[:, 0], feature_train[:, 1], n_row_train, n_col
)
X_test_sparse_csr = create_sparse_mt(
    feature_test[:, 2], feature_test[:, 0], feature_test[:, 1], n_row_test, n_col
)

In [None]:
# Display the summary repr of the training matrix.
X_train_sparse_csr

In [None]:
# with open('X_train_sparse_csr_ver1.pkl','wb') as f:
#     pickle.dump(X_train_sparse_csr,f)

In [None]:
def Similar( Y_chall_sparse_csr, Y_train_sparse_csr):
    '''
    Cosine similarity between every row of the first matrix and every row of the second.

    input: Y_chall_sparse_csr  matrix whose rows are the items to compare
           Y_train_sparse_csr  matrix whose rows are the reference items
    output: similarity scores as returned by sklearn's cosine_similarity;
            entry [i, j] compares row i of the first matrix with row j of the second
    '''
    return cosine_similarity(Y_chall_sparse_csr, Y_train_sparse_csr)

In [None]:
# Similarity of the first test row against every training row.
S = Similar(X_test_sparse_csr[0], X_train_sparse_csr)

In [None]:
def predict_price(id_phone,  Similar, k1):
    '''
    Predict the price of one test number as the mean price of its k1 most
    similar training numbers.

    input: id_phone  row of X_test_sparse_csr / label in df_test to predict
           Similar   callable(test_rows, train_rows) returning a similarity matrix
           k1        (int >= 1) number of neighbours to average
    output: one-row DataFrame with columns id, phone_number, price_ori, price_predict
    raises: ValueError if k1 < 1

    Reads the notebook globals X_test_sparse_csr, X_train_sparse_csr, df_train, df_test.
    '''
    # With k1 == 0 the slice [-0:] would silently select every training row,
    # and a negative k1 would select an arbitrary tail.
    if k1 < 1:
        raise ValueError('k1 must be a positive integer, got %r' % (k1,))

    # similarity of this test number to every training number (a single row)
    similar_vec = Similar(X_test_sparse_csr[id_phone], X_train_sparse_csr)
    # argsort is ascending, so the last k1 positions are the most similar training
    # rows; matrix rows are addressed by id_train_p, hence the isin() lookup below
    k1_u_similar = np.argsort(similar_vec[0])[-k1: ]

    # average price of those neighbours
    neighbour_prices = df_train.loc[df_train.id_train_p.isin(k1_u_similar), 'price'].values
    pred_price = np.mean(neighbour_prices)

    # one-row summary of the prediction
    return pd.DataFrame(
        {
            'id': [id_phone],
            'phone_number': [df_test.loc[id_phone, 'phone_number']],
            'price_ori': [df_test.loc[id_phone, 'price']],
            'price_predict': [pred_price],
        },
        columns=['id', 'phone_number', 'price_ori', 'price_predict'],
    )

In [None]:
# Single prediction for test row 1 with 10 neighbours.
d = predict_price(1, Similar, 10)

In [None]:
# Display the one-row prediction frame.
d

In [None]:
# Ratio of predicted to original price for that row (1.0 would be a perfect prediction).
d.loc[0, 'price_predict']/d.loc[0, 'price_ori']

In [None]:
# Predict every test row from its 10 most similar training numbers.
ls_pred = [predict_price(test_row, Similar, 10) for test_row in range(df_test.shape[0])]

In [None]:
def accuracy(ratio):
    '''Return 1 when `ratio` lies in the closed interval [0.9, 1.1], otherwise 0.'''
    within_tolerance = (ratio >= 0.9) and (ratio <= 1.1)
    return 1 if within_tolerance else 0
def evualuation(ls_pred):
    '''
    Combine per-number prediction frames and score them.

    input: ls_pred  list of DataFrames holding 'price_predict' and 'price_ori'
    output: the concatenated frame with two extra columns:
            'ratio' = price_predict / price_ori, 'acc' = accuracy(ratio)
    '''
    scored = pd.concat(ls_pred, axis= 0)
    ratios = scored['price_predict'] / scored['price_ori']
    scored['ratio'] = ratios
    scored['acc'] = ratios.apply(accuracy)
    return scored

In [None]:
# Score the predictions in ls_pred and count hits (acc == 1) versus misses.
df_p = evualuation(ls_pred)
df_p.acc.value_counts()

In [None]:
# Share of predictions within the tolerance. Computed from df_p instead of a hit
# count copied by hand from an earlier run, which goes stale when the data or k change.
df_p.acc.mean()

In [None]:
# Predict every test row from its 15 most similar training numbers.
ls_pred = [predict_price(test_row, Similar, 15) for test_row in range(df_test.shape[0])]

In [None]:
# Score the predictions in ls_pred and count hits (acc == 1) versus misses.
df_p = evualuation(ls_pred)
df_p.acc.value_counts()

In [None]:
# Predict every test row from its 20 most similar training numbers.
ls_pred = [predict_price(test_row, Similar, 20) for test_row in range(df_test.shape[0])]

In [None]:
# Score the predictions in ls_pred and count hits (acc == 1) versus misses.
df_p = evualuation(ls_pred)
df_p.acc.value_counts()

In [None]:
# Predict every test row from its 25 most similar training numbers.
ls_pred = [predict_price(test_row, Similar, 25) for test_row in range(df_test.shape[0])]

In [None]:
# Score the predictions in ls_pred and count hits (acc == 1) versus misses.
df_p = evualuation(ls_pred)
df_p.acc.value_counts()

In [None]:
# Predict every test row from its 50 most similar training numbers.
ls_pred = [predict_price(test_row, Similar, 50) for test_row in range(df_test.shape[0])]

In [None]:
# Score the predictions in ls_pred and count hits (acc == 1) versus misses.
df_p = evualuation(ls_pred)
df_p.acc.value_counts()

In [None]:
# Predict every test row from its 100 most similar training numbers.
ls_pred = [predict_price(test_row, Similar, 100) for test_row in range(df_test.shape[0])]

In [None]:
# Score the predictions in ls_pred and count hits (acc == 1) versus misses.
df_p = evualuation(ls_pred)
df_p.acc.value_counts()

In [None]:
# Predict every test row from its 5 most similar training numbers.
ls_pred = [predict_price(test_row, Similar, 5) for test_row in range(df_test.shape[0])]

In [None]:
# Score the predictions in ls_pred and count hits (acc == 1) versus misses.
df_p = evualuation(ls_pred)
df_p.acc.value_counts()