In [None]:
## Script Index: S3
## Author: Liu Yue
## Date: 03/10/2016
## Python Version: Anaconda 4.2.0, python 3.5

""" 
    This script matches the names of device models on ZOL and on TalkingData
"""

In [3]:
""" Methods for name tokenisation"""

import re

def rename(s, brand_name):
    """ Method for name tokenizasion, with steps explained """
    # Step 1: change lower to UPPER
    s = s.upper()
    # Step 2: remove brand name 
    s = s.replace(brand_name, '')
    # Step 3: synonyms
    s = s.replace('+',' PLUS')
    s.replace('TAB','') 
    s = s.replace('III', '3')
    s = s.replace('II', '2')
    s = s.replace('MI', '')
    s = s.replace('.0', '')
    # Step 4: replace meaningless punctuations
    random_char = "_.-?!@/#$)(（）"
    for char in random_char:
        s = s.replace(char,' ')    
    # Step 5 seperate into tokens (Chinese, [0-9], [a-zA-Z], others) and remove empty spaces
    lst = str2List(s)
    token_set = set([token for token in lst if token != ' '])
    return(token_set)

def char_type(uchar):
    """Method to get the type of character: Chinese, 0-9, a-zA-Z, other"""
    if uchar >= u'\u4e00' and uchar<=u'\u9fa5':
        return('is_chinese') 
    if uchar >= u'\u0030' and uchar<=u'\u0039':
        return('is_number')
    if (uchar >= u'\u0041' and uchar<=u'\u005a') or (uchar >= u'\u0061' and uchar<=u'\u007a'):
        return ('is_alphabet')
    else:
        return ('is_other')

def str2List (ustring):
    """Method for somewhat '1-gram' tokennization
    string to tokens by character type, e.g. '倾城L3C' -> ['倾城', 'L', '3', 'C']"""
    retList=[]
    token=''
    old_char_type = 'is_other'
    for i in range(0,len(ustring)):
        uchar = ustring[i]
        new_char_type = char_type(uchar)
        if i == 0 or new_char_type == old_char_type:
            token = token + uchar
            old_char_type = new_char_type
        else:
            retList.append(token)
            token = uchar
            old_char_type = new_char_type
    retList.append(token)
    return(retList) 

In [4]:
""" Loading: phone data scraped from Script S2"""
## Load zol_phone_model: [[phone_brand, zol_name, zol_link], ...]
with open('S2_zol phone model link.csv', newline='',encoding='utf-8') as f:
    file = csv.reader(f)
    zol_phone_model = list(file)
zol_phone_model[0][0] = zol_phone_model[0][0].replace(u'\ufeff', '') # Remove Byte Order Mark character at start of file
print('Sample data loaded', zol_phone_model[1])

## Tokenize name in zol_phone_model: [[phone_brand, zol_name, zol_link, zol_token], ...]
for line in zol_phone_model:
    line.append(rename(line[1], line[0]))
print('Relevant fields: zol_name, zol_link, zol_token')
print('Sample data:\n', zol_phone_model[:1])

Sample data loaded ['三星', '三星GALAXY S7 Edge（G9350/全网通）', '/cell_phone/index1100338.shtml']
Relevant fields: zol_name, zol_link, zol_token
Sample data:
 [['三星', '三星GALAXY Note 7（N9300/全网通）', '/cell_phone/index1116686.shtml', {'GALAXY', '全网通', '9300', '7', 'N', 'NOTE'}]]


In [5]:
""" Loading: tablet data scraped from Script S2"""
import csv
## Load zol_tablet_model: [[zol_name, zol_link], ...]
with open('S2_zol tablet model link.csv', newline='',encoding='utf-8') as f:
    file = csv.reader(f)
    zol_tablet_model = list(file)
zol_tablet_model[0][0] = zol_tablet_model[0][0].replace(u'\ufeff', '') # Remove Byte Order Mark character at start of file
print('Sample data loaded', zol_tablet_model[1])

## Tokenize name in zol_tablet_model: [[zol_name, zol_link, zol_token], ...]
for line in zol_tablet_model:
    line.append(rename(line[0], ""))
## Show sample data
print('Relevant fields: zol_name, zol_link, zol_token')
print('Sample data:\n', zol_tablet_model[:1])

Sample data loaded ['酷比魔方IWORK11手写版', '/tablepc/index1110165.shtml']
Relevant fields: zol_name, zol_link, zol_token
Sample data:
 [['华为揽阅 M2 10.0（64GB/WiFi版）', '/tablepc/index1137541.shtml', {'版', 'WIFI', '华为揽阅', 'GB', '2', '10', '64', 'M'}]]


In [6]:
""" Loading: data pre-processed by MySQL and saved as .csv 
    Possible but complex to load directly as MySQL charset not complient to universal charset
    See detailed loading method in Week 7 journal"""

## Load tk data: [[device_model_index, phone_brand, tk_name, count_device_id], ...]
import csv
import re
with open('S0_device_model_count.csv', newline='',encoding='utf-8') as f:
    file = csv.reader(f)
    tk_device = list(file)
tk_device = sum(tk_device, [])
print('Sample data loaded:', tk_device[:1] )

def format_sql_line (s):
    lst = re.split('"|;', s.replace('"',''))
    return lst
tk_device_models = [format_sql_line(line) for line in tk_device]

## Tokenize name for tk_device_models: [[device_model_index, phone_brand, tk_name, count_device_id, tk_token], ...]
## Add the re-name to each tk_device_model 
for line in tk_device_models:
    line.append(rename(line[2], line[1]))

print('Relevant fields: device_model_index, phone_brand, tk_name, count_device_id, tk_token')
print('Sample data ', tk_device_models[:1])

Sample data loaded: ['1;"爱派尔";"iPh-800";"3"']
Relevant fields: device_model_index, phone_brand, tk_name, count_device_id, tk_token
Sample data  [['1', '爱派尔', 'iPh-800', '3', {'800', 'IPH'}]]


In [24]:
""" Match zol phone with tk device models
    Under each brand:
    Match tk_name with zol_names, using subset method (tk <= zol) """

## Brand_found: tk brands with data on zol 
brands = []
for line in zol_phone_model: brands.append(line[0])
brands_found = list(set(brands))

## Tk device models, with brand data on zol
tk_dm_found = [line for line in tk_device_models if line[1] in brands_found]


def tk_filter(seq, brand_name):
    """ Method to filter tk phone names by brand"""
    return ([line for line in seq if line[1]==brand_name])

def zol_filter(seq, brand_name):
    """ Method to filter zol phone names by brand"""
    return ([line for line in seq if line[0]==brand_name])

def order_match(match_rough):
    """ Method to order list for easy filtering """
    match_1_index = set(line[0] for line in match_rough if line[7]>0) - set(line[0] for line in match_rough if line[7]>1) 
    match_1_ordered = sorted([line for line in match_rough if line[0] in match_1_index], key=lambda line:int(line[0]))
    ## More than one matches
    match_more_index = set(line[0] for line in match_rough if line[7]>1)
    match_more =  [line for line in match_rough if line[0] in match_more_index]
    ## 1. Ordered by increasing token numbers in zol_name (most similar first)
    match_more_ordered = sorted(match_more, key=lambda line:line[6])
    ## 2. Ordered by increasing tk_index
    match_more_ordered = sorted(match_more_ordered, key=lambda line:int(line[0]))
    ## No match
    match_0_ordered = sorted([line for line in match_rough if line[7]==0], key=lambda line:int(line[0]))
    match_ordered = match_1_ordered + match_more_ordered + match_0_ordered
    return(match_ordered)


match_rough = []
for brand_name in brands_found:
    """ Get a rough match for names, using subset method (tk <= zol) """
    tk_dm = tk_filter(tk_dm_found, brand_name)
    zol_dm = zol_filter(zol_phone_model, brand_name)

    ## Match names, record number of matches
    for tk in tk_dm:
        tk_index = tk[0]
        tk_phone_brand = tk[1]
        tk_device_model = tk[2]
        tk_count = tk[3]
        tk_token = tk[4]
        count = 0
        for zol in zol_dm:
            if tk_token.issubset(zol[3]):
                count += 1
                match = [tk_index, tk_phone_brand, tk_count, tk_device_model, zol[1], zol[2], len(zol[3]), count]
                match_rough.append(match)
        if count == 0:
            match = [tk_index, tk_phone_brand, tk_count, tk_device_model, '', '', len(zol[3]), count]
            match_rough.append(match)           
            
for tk in tk_dm_not_found:
    tk_index = tk[0]
    tk_phone_brand = tk[1]
    tk_device_model = tk[2]
    tk_count = tk[3]
    tk_token = tk[4]
    for zol in zol_tablet_model:       
        if tk_token.issubset(zol[2]):
            match = [tk_index, tk_phone_brand, tk_count, tk_device_model, zol[0], zol[1]]
            match_rough.append(match)

## Write the data to CSV files
with open('S3_zol phone match device model_rough.csv','w',newline='',encoding='utf-8-sig') as f:
    a = csv.writer(f)
    a.writerows(match_rough)

In [8]:
## Get back the list

## Load found device models: [[device_model_index, phone_brand, count_device_id, tk_name, zol_name, zol_link], ...]
with open('S3_zol phone match device model_rough.csv', newline='',encoding='utf-8-sig') as f:
    file = csv.reader(f)
    lst = list(file)
lst == match_rough

True

In [7]:
lst=[l for l in match_rough if l[0]!='']
## Set of device_model_index already found
indexes_found = set(l[0] for l in lst)

## Top device_model not found
tk_dm_not_found = [x for x in tk_device_models if x[0] not in indexes_found]
dm_not_found_top = sorted(tk_dm_not_found, key=lambda line:int(line[3]), reverse=True)

print('Precentage of device_model not found:', 1-len(set(indexes_found))/1217)
print('Number of device_model not found:', 1217 - len(set(indexes_found)))
print('Precentage of device_id not found:', sum([int(x[3]) for x in tk_dm_not_found])/58462)
print('Number of device_id not found:', sum([int(x[3]) for x in tk_dm_not_found]))
print('\nTop device_model not found:', dm_not_found_top[:10] )

Precentage of device_model not found: 0.1610517666392769
Number of device_model not found: 196
Precentage of device_id not found: 0.08068488932982108
Number of device_id not found: 4717

Top device_model not found: [['620', 'vivo', 'X5Pro', '386', {'X', '5', 'PRO'}], ['624', 'vivo', 'X6 D', '258', {'X', '6', 'D'}], ['612', 'vivo', 'X3L', '236', {'X', '3', 'L'}], ['617', 'vivo', 'X5M', '224', {'X', '5', 'M'}], ['619', 'vivo', 'X5Max+', '224', {'X', 'PLUS', '5', 'MAX'}], ['621', 'vivo', 'X5SL', '210', {'X', 'SL', '5'}], ['651', 'vivo', 'Y27', '177', {'27', 'Y'}], ['650', 'vivo', 'Y23L', '170', {'23', 'L', 'Y'}], ['630', 'vivo', 'Xplay3S', '151', {'S', '3', 'XPLAY'}], ['616', 'vivo', 'X5L', '142', {'X', '5', 'L'}]]
