In [1]:
import joblib
import pandas as pd
import numpy as np
import re
from scipy import sparse

In [2]:
# normalize mobile number
# Conversion table: 3-digit mobile prefix -> 2-digit replacement prefix.
_PREFIX_CONVERSION = {
    '162': '32', '163': '33', '164': '34', '165': '35', '166': '36',
    '167': '37', '168': '38', '169': '39',
    '120': '70', '121': '79', '122': '77', '126': '76', '128': '78',
    '123': '83', '124': '84', '125': '85', '127': '81', '129': '82',
    '199': '59',
}


def norm_mobile_number(mobile_number):
    '''Normalize a mobile number given as a string.

    Two steps are applied, in order:
      1. a single leading '0' is removed;
      2. if the first three characters of what remains are a key of the
         prefix conversion table, they are replaced by the mapped 2-digit
         prefix.

    Parameters
    ----------
    mobile_number : str
        The number as typed, with or without the leading '0'.

    Returns
    -------
    str
        The normalized number. An empty string is returned unchanged
        (indexing `mobile_number[0]` used to raise IndexError for it).
    '''
    # startswith() is safe on the empty string, unlike mobile_number[0].
    mobile = mobile_number[1:] if mobile_number.startswith('0') else mobile_number

    new_prefix = _PREFIX_CONVERSION.get(mobile[:3])
    if new_prefix is not None:
        mobile = new_prefix + mobile[3:]
    return mobile

# Two-digit prefixes accepted as mobile prefixes of Vietnamese carriers.
_MOBILE_PREFIXES = frozenset([
    '89', '90', '93', '70', '79', '77', '76', '78', '96', '97', '98', '86',
    '31', '32', '33', '34', '35', '36', '37', '38', '39', '88', '91', '94',
    '80', '81', '82', '83', '84', '85', '92', '56', '57', '58', '99', '19',
    '52', '59', '87', '95', '71', '72', '74', '75',
])


def check_mobile_number(mobile_number):
    '''Tell whether a normalized mobile number is well formed.

    A number is accepted when it has exactly 9 characters, every character
    is an ASCII digit, and its first two characters are a known mobile
    prefix.

    Parameters
    ----------
    mobile_number : str
        The number without its leading '0' (see `norm_mobile_number`).

    Returns
    -------
    int
        1 when the number is accepted, 0 otherwise (callers compare the
        result with `== 0`, so the integer return type is kept).
    '''
    if len(mobile_number) != 9:
        return 0
    # Reject non-digits here: the feature extraction converts slices of the
    # number with int(), which raises ValueError on anything else.
    if not all(char in '0123456789' for char in mobile_number):
        return 0
    return 1 if mobile_number[:2] in _MOBILE_PREFIXES else 0
    
# n-gram
def generate_ngrams(text, n_gram=2):
    '''Build the character n-grams of `text`, joined with '|'.

    Parameters
    ----------
    text : any
        Converted with `str()` before processing.
    n_gram : int, default 2
        Length of each window.

    Returns
    -------
    str
        The windows of `n_gram` consecutive characters, left to right,
        separated by '|', with every whitespace character removed.
        For example `generate_ngrams('12345', 2)` gives '12|23|34|45'.
        The result is '' when `text` is shorter than `n_gram`.
    '''
    chars = str(text)
    # zip() over the successively shifted strings yields one tuple per window
    # and stops at the shortest shift, i.e. after the last full window.
    windows = zip(*[chars[offset:] for offset in range(n_gram)])
    joined = "|".join("".join(window) for window in windows)
    # Raw string: '\s' in a normal literal is an invalid escape sequence.
    return re.sub(r'\s+', '', joined)

In [3]:
# extract feature
def _same_digit_runs(length):
    '''Return ['0' * length, '1' * length, ..., '9' * length].'''
    return [str(digit) * length for digit in range(10)]


def _sliding_windows(digits, length):
    '''Return every substring of `digits` of the given length, left to right.'''
    return [digits[i:i + length] for i in range(len(digits) - length + 1)]


# Same digit repeated 6 / 5 / 4 / 3 times ('000000' ... '999999', etc.).
_LUC = _same_digit_runs(6)
_NGU = _same_digit_runs(5)
_TU = _same_digit_runs(4)
_TAM = _same_digit_runs(3)
# Consecutive ascending runs ('012345', '123456', ...), by length.
_TANG6 = _sliding_windows('0123456789', 6)
_TANG5 = _sliding_windows('0123456789', 5)
_TANG4 = _sliding_windows('0123456789', 4)
_TANG3 = _sliding_windows('0123456789', 3)
# Consecutive descending runs ('987654', '876543', ...), by length.
_GIAM6 = _sliding_windows('9876543210', 6)
_GIAM5 = _sliding_windows('9876543210', 5)
_GIAM4 = _sliding_windows('9876543210', 4)
_GIAM3 = _sliding_windows('9876543210', 3)

# Default candidates searched inside the middle digits. Order matters: the
# first candidate found wins. (Descending triples are not part of this list.)
_MIDDLE_RUNS = _TANG5 + _GIAM5 + _TANG4 + _GIAM4 + _TAM + _TANG3

# (start, end, table) windows tested for an exact match in the middle digits.
_TU_WINDOW = ((2, 6, _TU),)
_NGU_TU_WINDOWS = ((2, 7, _NGU), (2, 6, _TU))

# Last-two-digit endings that get their own '<digits>e' feature.
_LUCKY_ENDINGS = frozenset([
    '06', '09', '10', '12', '16', '17', '18', '19', '26', '28', '33', '36',
    '37', '38', '39', '40', '46', '50', '52', '53', '56', '57', '66', '68',
    '77', '78', '79', '80', '83', '86', '88', '89', '90', '92', '97',
])


def _middle_phrase(mobile, windows, abab_starts, candidates, search_end):
    '''Pick the feature describing the middle digits, or None.

    Rules are tried in this order and the first hit is returned:
      1. for each (start, end, table) in `windows`: `mobile[start:end]` is
         in `table`                      -> that slice + 'b';
      2. for any i in `abab_starts`: `mobile[i:i+2] == mobile[i+2:i+4]`
                                         -> 'ABABb';
      3. the first entry of `candidates` that is a substring of
         `mobile[2:search_end]`          -> that entry + 'b'.
    The trailing 'b' marks a "between" (middle) feature.
    '''
    for start, end, table in windows:
        chunk = mobile[start:end]
        if chunk in table:
            return chunk + 'b'
    if any(mobile[i:i + 2] == mobile[i + 2:i + 4] for i in abab_starts):
        return 'ABABb'
    searched = mobile[2:search_end]
    for candidate in candidates:
        if candidate in searched:
            return candidate + 'b'
    return None


def _tail_and_middle_phrases(mobile):
    '''Classify the ending of a 9-digit number and, given that, its middle.

    Returns a pair `(tail, middle)`; either element may be None. The tail
    rules are tried from the longest pattern (6 digits) down to the shortest
    (3 digits) and the first match decides which middle rule applies.
    `mobile` must be exactly 9 digits, so `mobile[2:8]` is the six digits
    between the prefix and the last digit.
    '''
    last6, last5, last4, last3 = mobile[-6:], mobile[-5:], mobile[-4:], mobile[-3:]
    # The last six digits seen as three pairs: AB | CD | EF.
    pair1, pair2, pair3 = mobile[-6:-4], mobile[-4:-2], mobile[-2:]

    # ---- 6-digit endings -------------------------------------------------
    # Same digit x6, ascending run or descending run; the middle feature is
    # emitted only when the six middle digits belong to the same family.
    for table in (_LUC, _TANG6, _GIAM6):
        if last6 in table:
            middle6 = mobile[2:8]
            return last6, (middle6 + 'b' if middle6 in table else None)
    # Three pairs increasing by one: AB, AB+1, AB+2.
    if int(pair1) == int(pair2) - 1 and int(pair1) == int(pair3) - 2:
        return 'ABAB+1AB+2', None
    # Three pairs decreasing by one: AB, AB-1, AB-2.
    if int(pair1) == int(pair2) + 1 and int(pair1) == int(pair3) + 2:
        return 'ABAB-1AB-2', None
    # Same pair three times: ABABAB.
    if pair1 == pair2 and pair1 == pair3:
        middle_repeats = mobile[2:4] == mobile[4:6] and mobile[2:4] == mobile[6:8]
        return 'ABABAB', ('ABABABb' if middle_repeats else None)

    # ---- 5-digit endings -------------------------------------------------
    if last5 in _NGU:
        head3 = mobile[2:5]
        return last5, (head3 + 'b' if head3 in _TANG3 else None)
    if last5 in _TANG5 or last5 in _GIAM5:
        head3 = mobile[2:5]
        return last5, (head3 + 'b' if head3 in _TAM else None)
    # ABCAB: the last two digits repeat the pair three positions earlier.
    if mobile[-5:-3] == pair3:
        return 'ABCAB', _middle_phrase(
            mobile, _TU_WINDOW, (2,),
            _TANG5 + _GIAM5 + _TU + _TANG4 + _GIAM4 + _TAM + _TANG3, 6)

    # ---- 4-digit endings -------------------------------------------------
    if last4 in _TU:
        return last4, _middle_phrase(
            mobile, (), (2,), _TANG4 + _GIAM4 + _TAM + _TANG3, 6)
    if last4 in _TANG4:
        return last4, _middle_phrase(
            mobile, _TU_WINDOW, (2, 4, 3), _GIAM4 + _TAM + _TANG3, 6)
    if last4 in _GIAM4:
        return last4, _middle_phrase(
            mobile, _TU_WINDOW, (2, 4, 3), _TANG4 + _TAM + _TANG3, 6)
    # Two pairs increasing by one: AB, AB+1.
    if int(pair2) == int(pair3) - 1:
        return 'ABAB+1', _middle_phrase(mobile, _TU_WINDOW, (2, 4, 3), _MIDDLE_RUNS, 8)
    # Two pairs decreasing by one: AB, AB-1.
    if int(pair2) == int(pair3) + 1:
        return 'ABAB-1', _middle_phrase(mobile, _TU_WINDOW, (2, 4, 3), _MIDDLE_RUNS, 8)
    # Same pair twice: ABAB.
    if pair2 == pair3:
        return 'ABAB', _middle_phrase(mobile, _TU_WINDOW, (2, 4, 3), _MIDDLE_RUNS, 8)

    # ---- 3-digit endings -------------------------------------------------
    if last3 in _TAM:
        return last3, _middle_phrase(mobile, _TU_WINDOW, (2, 3), _MIDDLE_RUNS, 7)
    # Ascending and descending triples share one middle rule. The descending
    # case used to reference an undefined name (`phone`) and raised NameError.
    if last3 in _TANG3 or last3 in _GIAM3:
        return last3, _middle_phrase(mobile, _NGU_TU_WINDOWS, (2, 4, 3), _MIDDLE_RUNS, 8)
    # ABA: first and last of the final three digits are equal.
    if mobile[-3] == mobile[-1]:
        return 'ABA', _middle_phrase(mobile, _NGU_TU_WINDOWS, (2, 4, 3), _MIDDLE_RUNS, 8)

    # ---- no special ending: only the six middle digits are examined --------
    for start, end, table in ((2, 8, _LUC),
                              (2, 7, _NGU), (3, 8, _NGU),
                              (2, 6, _TU), (3, 7, _TU), (4, 8, _TU)):
        chunk = mobile[start:end]
        if chunk in table:
            return None, chunk + 'b'
    if mobile[2:4] == mobile[4:6] and mobile[2:4] == mobile[6:8]:
        return None, 'ABABABb'
    # NOTE(review): the original candidate list here was
    # `so_tang6 + so_tang6 + ...`, i.e. the ascending 6-runs listed twice;
    # the descending 6-runs (_GIAM6) were probably intended. The duplicate is
    # dropped (no behavior change) but _GIAM6 is NOT added, because that would
    # alter feature vectors the persisted model may have been trained on --
    # confirm against the training code before changing.
    return None, _middle_phrase(mobile, (), (2, 4, 3), _TANG6 + _MIDDLE_RUNS, 8)


def extract_feature_mobile_number(mobile, feature_file='feature_phrases.xlsx'):
    '''Turn a normalized mobile number into a sparse feature row.

    The features are weighted "phrases":
      * every bigram of the last 7 digits, weighted by its occurrence count;
      * the 2-digit prefix + 's' (start), weight 5;
      * the last 2 digits + 'e' (end), weight 3, when they are one of the
        listed endings;
      * at most one tail pattern and one middle pattern (suffix 'b'),
        weight 5 each -- see `_tail_and_middle_phrases`.
    Each phrase is mapped to a column through the vocabulary sheet.

    Parameters
    ----------
    mobile : str
        Normalized number (9 digits, no leading '0').
    feature_file : str, default 'feature_phrases.xlsx'
        Excel workbook whose sheet 'Sheet1' holds the vocabulary. The sheet
        must provide a `num_phrases` column (merge key) and an `id` column
        (used as the column index).

    Returns
    -------
    scipy.sparse.csr_matrix or int
        A 1 x n matrix where n is the number of vocabulary rows, or -1 when
        `check_mobile_number(mobile)` rejects the number.
    '''
    if check_mobile_number(mobile) == 0:
        return -1

    # Bigram counts over the 7 digits that follow the 2-digit prefix.
    bigrams = generate_ngrams(mobile[2:], n_gram=2).split('|')
    bigram_counts = pd.Series(bigrams).value_counts()
    rows = [{'num_phrases': phrase, 'count': int(count)}
            for phrase, count in bigram_counts.items()]

    # Prefix feature ('s' = start) and optional ending feature ('e' = end).
    rows.append({'num_phrases': mobile[:2] + 's', 'count': 5})
    if mobile[-2:] in _LUCKY_ENDINGS:
        rows.append({'num_phrases': mobile[-2:] + 'e', 'count': 3})

    # Pattern features; the tail row precedes the middle row, as before.
    for phrase in _tail_and_middle_phrases(mobile):
        if phrase is not None:
            rows.append({'num_phrases': phrase, 'count': 5})

    # Rows are collected in a list and the frame is built once:
    # DataFrame.append no longer exists in pandas 2.x.
    features = pd.DataFrame(rows, columns=['num_phrases', 'count'])
    features['id'] = 0  # every feature belongs to matrix row 0

    # read_excel closes the workbook itself; the former `encoding` keyword is
    # not accepted by current pandas.
    vocabulary = pd.read_excel(feature_file, sheet_name='Sheet1')

    # Inner join: a phrase missing from the vocabulary has no column index
    # and is skipped (a left join leaves NaN in `id_y` for it).
    features = pd.merge(features[['id', 'num_phrases', 'count']], vocabulary,
                        on='num_phrases', how='inner')

    # The matrix width is the number of vocabulary rows.
    # assumes vocabulary ids run from 0 to n_rows - 1 -- TODO confirm
    n_col = vocabulary.shape[0]
    return sparse.csr_matrix(
        (features['count'].values,
         (features['id_x'].values, features['id_y'].values)),
        shape=(1, n_col))

In [4]:
class mobile_price():
    '''Predict the price band of a mobile number with a persisted model.'''

    def __init__(self, model_file):
        '''Store the model path and load the model from it.

        NOTE(review): joblib.load unpickles the file, which can execute
        arbitrary code -- only load model files from a trusted source.
        '''
        self.model_file = model_file
        self.model = joblib.load(self.model_file)

    def predict_price(self, mobile_number):
        '''Return the predicted price band for `mobile_number`.

        The number is normalized and validated first; an invalid number
        yields -1. Otherwise the result is a dict with two keys:
        'target' (the model's prediction) and 'price' (the label of that
        band, or '' when the prediction is not one of 0..6).

        The feature vector, the prediction and the result dict are also kept
        on the instance as `vec_x`, `y_pred` and `result`.
        '''
        # Labels indexed by predicted target (0 .. 6).
        band_labels = (
            '< 200.000 VND',
            '200.000 - 499.000 VND',
            '500.000 - 999.000 VND',
            '1.000.000 - 2.999.000 VND',
            '3.000.000 - 9.999.000 VND',
            '10.000.000 - 49.999.000 VND',
            '>= 50.000.000 VND',
        )

        normalized = norm_mobile_number(mobile_number)
        if check_mobile_number(normalized) == 0:
            return -1

        self.vec_x = extract_feature_mobile_number(normalized)
        # predict() returns one value per input row; take the last one.
        self.y_pred = self.model.predict(self.vec_x)[-1]
        self.result = {'target': self.y_pred}

        # Compare against 0, 1, ..., 6 in order and keep the first match.
        label = ''
        for band, text in enumerate(band_labels):
            if self.y_pred == band:
                label = text
                break
        self.result['price'] = label
        return self.result

In [5]:
# Build the predictor once; the constructor loads the model file from disk.
# The path is relative to the notebook's working directory.
pre = mobile_price('model/random_forest_ver2.pkl')



In [12]:
# Example call: the number is passed as a string, leading '0' included.
# Returns a dict with 'target' and 'price', or -1 for an invalid number.
pre.predict_price('0973878786')

{'target': 3, 'price': '1.000.000 - 2.999.000 VND'}