In [1]:
import pandas as pd
import numpy as np
from googletrans import Translator
import zipfile

In [2]:
# open the competition archive read-only; the CSV members are pulled out of it
# by name in the cells that follow
zf = zipfile.ZipFile(
    file='competitive-data-science-predict-future-sales.zip',
    mode='r',
)

In [3]:
# read shops; the archive member is opened in a `with` block so its handle is
# closed as soon as the frame has been parsed (read_csv does not close
# handles it did not open itself)
with zf.open('shops.csv') as shops_file:
    shops = pd.read_csv(shops_file)

In [4]:
# read item categories; the archive member is opened in a `with` block so its
# handle is closed as soon as the frame has been parsed (read_csv does not
# close handles it did not open itself)
with zf.open('item_categories.csv') as item_categories_file:
    item_categories = pd.read_csv(item_categories_file)

#### clean data and create new categories

In [5]:
# get cities from shops: drop the single leading '!' that some shop names
# carry, then keep the first space-separated token of the name.
# Vectorised .str accessors are used so an empty or missing shop name yields
# '' / NaN instead of raising.
# NOTE(review): the first token is not always a city (e.g. 'Выездная Торговля'
# yields 'Выездная') — confirm whether such names need special-casing.
shops['city'] = (
    shops['shop_name']
    .str.replace(r'^!', '', regex=True)
    .str.split(' ')
    .str[0]
)

In [6]:
# get sub categories for item categories.
# Category names are written as '<main> - <sub>'. Split once on the ' - '
# separator (not on every bare '-') so hyphens inside a name survive and the
# padding around the separator does not leak into the resulting values.
# Names without the separator keep the whole name as sub_category_1 and get
# 'Not applicable' as sub_category_2.
category_parts = [
    name.partition(' - ') for name in item_categories['item_category_name']
]
item_categories['sub_category_1'] = [
    main.strip() for main, _, _ in category_parts
]
item_categories['sub_category_2'] = [
    sub.strip() if separator else 'Not applicable'
    for _, separator, sub in category_parts
]


#### translate

In [7]:
def translation_dictionary(series=None, *, translator=None, src='ru', dest='en'):
    """
    Build a lookup that maps each distinct value of ``series`` to its translation.

    Every unique, non-missing value is sent to the translator exactly once and
    the translated text is collected into a dictionary that can be passed to
    ``Series.map`` to add a translated column to the dimensional data sets.

    Parameters
    ----------
    series : pandas.Series, optional
        Values to translate.  When omitted, ``shops['city']`` is looked up at
        call time, so the current contents of ``shops`` are used rather than
        whatever existed when the function was defined.
    translator : object, optional
        Object exposing ``translate(text, src=..., dest=...)`` whose result
        has a ``.text`` attribute.  Defaults to a new googletrans
        ``Translator``; one instance is shared by all values of the call.
    src, dest : str, optional
        Source and destination language codes (default Russian -> English).

    Returns
    -------
    dict
        ``{original value: translated text}``, ordered by first appearance of
        each value in ``series``.
    """
    if series is None:
        # resolved lazily on purpose: a default evaluated in the signature
        # would be frozen at definition time
        series = shops['city']
    if translator is None:
        translator = Translator()

    # missing values cannot be translated; Series.map leaves them as NaN anyway
    series_unique = series.dropna().unique()

    return {
        value: translator.translate(value, src=src, dest=dest).text
        for value in series_unique
    }

In [8]:
# Russian -> English lookup covering every distinct shop city
trans_dict = translation_dictionary(shops['city'])

In [9]:
# add the translated city next to the original one; relies on `trans_dict`
# currently holding the lookup built from shops['city']
shops['city_en'] = shops['city'].map(trans_dict)

In [10]:
# display the shops table to eyeball the derived city / city_en columns
shops

Unnamed: 0,shop_name,shop_id,city,city_en
0,"!Якутск Орджоникидзе, 56 фран",0,Якутск,Yakutsk
1,"!Якутск ТЦ ""Центральный"" фран",1,Якутск,Yakutsk
2,"Адыгея ТЦ ""Мега""",2,Адыгея,Adygea
3,"Балашиха ТРК ""Октябрь-Киномир""",3,Балашиха,Balashikha
4,"Волжский ТЦ ""Волга Молл""",4,Волжский,Volzhsky
5,"Вологда ТРЦ ""Мармелад""",5,Вологда,Vologda
6,"Воронеж (Плехановская, 13)",6,Воронеж,Voronezh
7,"Воронеж ТРЦ ""Максимир""",7,Воронеж,Voronezh
8,"Воронеж ТРЦ Сити-Парк ""Град""",8,Воронеж,Voronezh
9,Выездная Торговля,9,Выездная,exit


In [11]:
# translate each distinct top-level category once, then broadcast the result
# back onto the rows as a new column
trans_dict = translation_dictionary(item_categories['sub_category_1'])
item_categories['sub_category_1_en'] = (
    item_categories['sub_category_1'].map(trans_dict)
)

In [12]:
# same again for the second-level category (includes the 'Not applicable'
# placeholder used when a name has no sub category)
trans_dict = translation_dictionary(item_categories['sub_category_2'])
item_categories['sub_category_2_en'] = (
    item_categories['sub_category_2'].map(trans_dict)
)

In [13]:
# display the categories table to eyeball the split and translated columns
item_categories

Unnamed: 0,item_category_name,item_category_id,sub_category_1,sub_category_2,sub_category_1_en,sub_category_2_en
0,PC - Гарнитуры/Наушники,0,PC,Гарнитуры/Наушники,PC,Headsets / Headphones
1,Аксессуары - PS2,1,Аксессуары,PS2,accessories,PS2
2,Аксессуары - PS3,2,Аксессуары,PS3,accessories,PS3
3,Аксессуары - PS4,3,Аксессуары,PS4,accessories,PS4
4,Аксессуары - PSP,4,Аксессуары,PSP,accessories,PSP
...,...,...,...,...,...,...
79,Служебные,79,Служебные,Not applicable,System Tools,Not applicable
80,Служебные - Билеты,80,Служебные,Билеты,System Tools,tickets
81,Чистые носители (шпиль),81,Чистые носители (шпиль),Not applicable,Net carriers (spire),Not applicable
82,Чистые носители (штучные),82,Чистые носители (штучные),Not applicable,Net carriers (piece),Not applicable


In [14]:
# persist the enriched shops table; the positional row index is not written
shops.to_csv(path_or_buf='shops_translated.csv', index=False)

In [15]:
# persist the enriched categories table; the positional row index is not written
item_categories.to_csv(path_or_buf='item_categories_translated.csv', index=False)

In [16]:
# sanity check: read the exported file back and display it
pd.read_csv(filepath_or_buffer='shops_translated.csv')

Unnamed: 0,shop_name,shop_id,city,city_en
0,"!Якутск Орджоникидзе, 56 фран",0,Якутск,Yakutsk
1,"!Якутск ТЦ ""Центральный"" фран",1,Якутск,Yakutsk
2,"Адыгея ТЦ ""Мега""",2,Адыгея,Adygea
3,"Балашиха ТРК ""Октябрь-Киномир""",3,Балашиха,Balashikha
4,"Волжский ТЦ ""Волга Молл""",4,Волжский,Volzhsky
5,"Вологда ТРЦ ""Мармелад""",5,Вологда,Vologda
6,"Воронеж (Плехановская, 13)",6,Воронеж,Voronezh
7,"Воронеж ТРЦ ""Максимир""",7,Воронеж,Voronezh
8,"Воронеж ТРЦ Сити-Парк ""Град""",8,Воронеж,Voronezh
9,Выездная Торговля,9,Выездная,exit
