In [1]:
# import dill
import pickle

# Сбор данных из vk

In [2]:
import os
import zipfile
import pickle
# из 1 дз
class FileStorage():
    """Zip-archive storage that keeps one pickled record per archive member.

    Member names are the keys of the dictionaries passed to ``write_data`` /
    ``append_data``; member payloads are the pickled dictionary values.
    """

    def __init__(self, file_name):
        """
        :param file_name: path of the zip archive used as the backing store
        """
        self.file_name = file_name
        # Member names found by the most recent ``read_data`` call.
        self.user_ids = []

    def read_data(self):
        """Load every record stored in the archive.

        Also refreshes ``self.user_ids`` with the archive's member names.

        :return: dict mapping member name to the unpickled record
        :raises StopIteration: if the archive file does not exist (exception
            type kept unchanged for backward compatibility with callers)
        """
        if not os.path.exists(self.file_name):
            raise StopIteration

        data = {}
        with zipfile.ZipFile(self.file_name) as zf:
            self.user_ids = zf.namelist()
            for file in self.user_ids:
                # SECURITY: unpickling can execute arbitrary code; only open
                # archives that were produced by this notebook.
                data[file] = pickle.loads(zf.read(file))
        return data

    def write_data(self, data):
        """Overwrite the archive with the given records.

        :param data: dict mapping user name (archive member name) to user data
        """
        with zipfile.ZipFile(self.file_name, mode='w', compression=zipfile.ZIP_DEFLATED) as zf:
            for key, value in data.items():
                zf.writestr(key, pickle.dumps(value))

    def append_data(self, data):
        """Merge new records into the archive, replacing records with equal keys.

        The archive is read completely, updated in memory and written back.
        A missing archive is treated as empty, so the first append creates it.

        :param data: dict mapping user name (archive member name) to user data
        """
        if os.path.exists(self.file_name):
            merged = self.read_data()
        else:
            merged = {}
        merged.update(data)
        self.write_data(merged)


In [3]:
import logging

FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
LOG_FILE = 'vk_user_analise.log'

log_formatter = logging.Formatter(FORMAT)
logger = logging.getLogger('vk_user_analise')
logger.setLevel(logging.INFO)

# Loggers are process-wide singletons, so re-running this cell used to attach
# a second pair of handlers and every record was emitted twice. Drop handlers
# left over from a previous run before attaching fresh ones.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

# Persist records to the log file ...
fh = logging.FileHandler(LOG_FILE)
fh.setFormatter(log_formatter)
logger.addHandler(fh)

# ... and mirror them to the notebook output (stderr stream).
ch = logging.StreamHandler()
ch.setFormatter(log_formatter)
logger.addHandler(ch)

In [4]:
import logging
from time import sleep,time
import numpy as np         
import vk

loggerS = logging.getLogger('vk_user_analise.scrapper')

class Scrapper(object):
    """Downloads VK user profiles for a random sample of user ids.

    The sampled ids are cached in ``self.user_ids_file`` so that an
    interrupted run can be resumed. VK credentials are read from the
    ``VK_APP_ID``, ``VK_LOGIN`` and ``VK_PASSWORD`` environment variables;
    they must never be committed with the notebook.
    """

    def __init__(self, delay=1):
        """
        :param delay: throttling window in seconds used by ``_wait``
        """
        self.user_ids_file = 'users.txt'
        self.delay = delay

    def _get_user_ids(self, max_id=1890000, sample_size=100000):
        """Sample user ids without replacement and store them in the ids file.

        :param max_id: ids are drawn from ``range(max_id)``
        :param sample_size: number of ids to keep
        """
        # ``np.int`` was removed in NumPy 1.24; the builtin ``int`` is the
        # type it used to alias.
        all_users = np.arange(max_id, dtype=int)
        np.random.shuffle(all_users)
        user_ids = all_users[:sample_size]

        with open(self.user_ids_file, 'w') as output_file:
            output_file.write(';'.join(map(str, user_ids)))

        loggerS.info('total: {} users'.format(len(user_ids)))

    def _wait(self, c, s):
        """Throttle API calls: at most two requests per ``self.delay`` seconds.

        :param c: number of requests made since the counter was last reset
        :param s: timestamp (``time()``) at which the current window started
        :return: updated ``(count, start)`` pair to pass to the next call
        """
        count = c + 1
        start = s
        if count >= 2:
            count = 0
            # Two requests finished inside one window: pause before going on.
            if time() - start <= self.delay:
                sleep(self.delay)
            start = time()
        return count, start

    @staticmethod
    def _get_credentials():
        """Read the VK application id, login and password from the environment.

        :return: ``(app_id, login, password)``
        :raises RuntimeError: if any of the variables is not set
        """
        try:
            return (os.environ['VK_APP_ID'],
                    os.environ['VK_LOGIN'],
                    os.environ['VK_PASSWORD'])
        except KeyError as err:
            raise RuntimeError(
                'VK credentials are not configured: set the VK_APP_ID, '
                'VK_LOGIN and VK_PASSWORD environment variables') from err

    def scrap_process(self, storage, update=False):
        """Download profiles for all sampled ids and persist them in ``storage``.

        Ids already present in ``storage`` are skipped unless ``update`` is
        true, in which case the storage is emptied first. Records are flushed
        to the storage in batches of 100 and once more when the loop ends,
        whether it finished, was interrupted from the keyboard or failed.

        :param storage: object with ``file_name``, ``user_ids``,
            ``read_data``, ``write_data`` and ``append_data`` (a FileStorage)
        :param update: start from an empty storage instead of resuming
        :raises RuntimeError: if the VK credentials are not configured
        """
        loggerS.info('Start scrapping')

        # Fail before touching the storage if the credentials are missing.
        vk_id, login, password = self._get_credentials()

        if not os.path.exists(self.user_ids_file):
            self._get_user_ids()
        with open(self.user_ids_file, 'r') as f:
            user_ids = f.read().split(';')
        user_total = len(user_ids)

        data = {}
        user_done = set()
        if not os.path.exists(storage.file_name) or update:
            storage.write_data(data)
        else:
            storage.read_data()
            # A set gives O(1) membership tests and does not alias the
            # storage's own list.
            user_done = set(storage.user_ids)

        api_version = '5.92'

        session = vk.AuthSession(app_id=vk_id, user_login=login, user_password=password)
        api = vk.API(session)
        fields = ['bdate', 'country', 'city', 'counters', 'education', 'career', 'occupation',
                  'personal', 'relation', 'sex', 'about', 'activities', 'books', 'games',
                  'interests', 'movies', 'music', 'quotes', 'tv']

        start = time()
        count = 0
        last_percent = 0
        try:
            for i, user_id in enumerate(user_ids):
                if user_id in user_done:
                    continue

                percent = int(i / user_total * 100)
                user_data = api.users.get(user_ids=user_id, v=api_version, fields=fields)
                count, start = self._wait(count, start)
                if not user_data:
                    # Nothing to store for this id; avoid an IndexError below.
                    loggerS.warning('no data returned for user {}'.format(user_id))
                    continue

                profile = user_data[0]
                # Subscriptions are requested only for profiles explicitly
                # marked as not closed; a missing flag skips the request.
                if 'is_closed' in profile and not profile['is_closed']:
                    profile['groups'] = api.users.getSubscriptions(
                        user_id=user_id, extended=1, count=200, v=api_version)
                    count, start = self._wait(count, start)
                data[user_id] = profile

                if len(data) >= 100:
                    storage.append_data(data)
                    data = {}

                if percent != last_percent:
                    loggerS.info(f'{percent}% done')
                    last_percent = percent

                user_done.add(user_id)
        except KeyboardInterrupt:
            loggerS.info('Scrapping interrupt by keyboard')
        finally:
            # Persist the partially filled batch even if the API call raised.
            if data:
                storage.append_data(data)

        loggerS.info('Scrapping done')
    

In [5]:
import re
import numpy as np
import pandas as pd

class FeatureExtractor():
    """Turns raw VK profile dictionaries into a flat pandas DataFrame.

    Only open profiles are converted; closed and deleted profiles are counted
    in ``count_closed`` / ``count_deleted`` and skipped.
    """

    # Columns filled through the ``_get_<field>`` methods, in fill order.
    _OPTIONAL_FIELDS = ('sex', 'age', 'country', 'city', 'videos', 'audios',
                        'photos', 'friends', 'groups', 'university',
                        'activities', 'books', 'games', 'movies', 'music',
                        'personal', 'quotes', 'tv', 'interests')

    def __init__(self, year):
        """
        :param year: reference year used to turn a birth year into an age
        """
        self.fields = ['id','first_name','last_name','sex','age','country','city',
                       'videos','audios','photos','friends','groups','university',
                       'activities','books','games','interests','movies','music',
                       'personal','quotes','tv']
        self.count_open = 0
        self.count_closed = 0
        self.count_deleted = 0
        self.year = year

    @staticmethod
    def _lookup(data, *path):
        """Follow ``path`` through nested dicts.

        :return: the value found, or NaN when a key is missing or the value
            is an empty string
        """
        value = data
        try:
            for key in path:
                value = value[key]
        except KeyError:
            return np.nan
        # ``np.NaN`` was removed in NumPy 2.0; ``np.nan`` works on every version.
        return np.nan if value == '' else value

    def _get_age(self, data):
        """Age as ``self.year`` minus the first 4-digit number in ``bdate``; NaN if absent."""
        try:
            match = re.search(r'(\d{4})', data['bdate'])
        except KeyError:
            return np.nan
        if match:
            return int(self.year - int(match.group(1)))
        return np.nan

    def _get_sex(self, data):
        """Raw ``sex`` value; unlike the other getters a missing key raises KeyError."""
        return data['sex']

    def _get_country(self, data):
        return self._lookup(data, 'country', 'title')

    def _get_city(self, data):
        return self._lookup(data, 'city', 'title')

    def _get_videos(self, data):
        return self._lookup(data, 'counters', 'videos')

    def _get_audios(self, data):
        return self._lookup(data, 'counters', 'audios')

    def _get_photos(self, data):
        return self._lookup(data, 'counters', 'photos')

    def _get_university(self, data):
        return self._lookup(data, 'university_name')

    def _get_activities(self, data):
        return self._lookup(data, 'activities')

    def _get_books(self, data):
        return self._lookup(data, 'books')

    def _get_games(self, data):
        return self._lookup(data, 'games')

    def _get_movies(self, data):
        return self._lookup(data, 'movies')

    def _get_personal(self, data):
        return self._lookup(data, 'personal')

    def _get_quotes(self, data):
        return self._lookup(data, 'quotes')

    def _get_tv(self, data):
        return self._lookup(data, 'tv')

    def _get_groups(self, data):
        """Comma-joined names of subscriptions whose type is ``'page'``; NaN if none.

        Tabs become spaces and line breaks are dropped so the value stays on
        one line of the tab-separated export.
        """
        try:
            names = [gr['name'] for gr in data['groups']['items'] if gr['type'] == 'page']
        except KeyError:
            return np.nan
        groups = ','.join(names)
        for old, new in (('\t', ' '), ('\n', ''), ('\r', ''), (', ,', ',')):
            groups = groups.replace(old, new)
        return np.nan if groups == '' else groups

    def _get_music(self, data):
        return self._lookup(data, 'music')

    def _get_friends(self, data):
        return self._lookup(data, 'counters', 'friends')

    def _get_interests(self, data):
        return self._lookup(data, 'interests')

    def transform(self, data):
        """Build the feature table from raw profiles.

        :param data: dict mapping user id to the raw profile dict
        :return: DataFrame with one row per open profile and the columns
            listed in ``self.fields``
        """
        df_data = {field: [] for field in self.fields}
        for user_id, user_data in data.items():
            try:
                closed = user_data['is_closed']
            except KeyError:
                # Profiles without the flag are counted as deleted.
                self.count_deleted += 1
                continue
            if closed:
                self.count_closed += 1
                continue
            self.count_open += 1
            # mandatory fields
            df_data['id'].append(user_id)
            df_data['first_name'].append(user_data['first_name'])
            df_data['last_name'].append(user_data['last_name'])
            # optional fields
            for field in self._OPTIONAL_FIELDS:
                df_data[field].append(getattr(self, '_get_' + field)(user_data))
        return pd.DataFrame(df_data)


In [6]:
# True: empty the storage archive and scrape from scratch; False: resume.
update_data = False
# True: run the VK scraper; False: work with the data collected earlier.
collect_data = False

In [7]:
# Archive holding the pickled raw profiles.
data_file = 'test.zip'
fs = FileStorage(data_file)

# Scraping is opt-in: it needs network access and VK credentials.
if collect_data:
    sc = Scrapper()
    sc.scrap_process(fs, update=update_data)
# fs.read_data()

# Анализ данных

In [8]:
# Set to True to rebuild data.csv from the raw archive; False loads the
# previously exported table.
collect_data = False
if collect_data:
    fe = FeatureExtractor(2018)
    df = fe.transform(fs.read_data())
    # open / closed / deleted profile counts
    print(fe.count_open, fe.count_closed, fe.count_deleted)
    df.to_csv("data.csv", index=False, sep='\t')
else:
    # ``np.int`` was removed in NumPy 1.24; it was an alias of the builtin
    # ``int``, which pandas accepts directly.
    df = pd.read_csv("data.csv", sep='\t', dtype={'id': int, 'sex': int})

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65569 entries, 0 to 65568
Data columns (total 22 columns):
id            65569 non-null int64
first_name    65568 non-null object
last_name     65567 non-null object
sex           65569 non-null int64
age           26585 non-null float64
country       61093 non-null object
city          57448 non-null object
videos        65569 non-null int64
audios        65569 non-null int64
photos        65569 non-null int64
friends       65569 non-null int64
groups        52392 non-null object
university    15397 non-null object
activities    5147 non-null object
books         5376 non-null object
games         4466 non-null object
interests     6342 non-null object
movies        5807 non-null object
music         5961 non-null object
personal      12885 non-null object
quotes        6505 non-null object
tv            3971 non-null object
dtypes: float64(1), int64(6), object(15)
memory usage: 11.0+ MB


In [9]:
import pandas_profiling

# Build the profiling report for the whole table. The call is left as the
# cell's last bare expression so that its result is what the cell displays.
pandas_profiling.ProfileReport(df)

0,1
Number of variables,22
Number of observations,65569
Total Missing (%),45.0%
Total size in memory,11.0 MiB
Average record size in memory,176.0 B

0,1
Numeric,6
Categorical,15
Boolean,1
Date,0
Text (Unique),0
Rejected,0
Unsupported,0

0,1
Distinct count,4618
Unique (%),7.0%
Missing (%),92.2%
Missing (n),60422

0,1
нет,26
работаю,24
Студент,19
Other values (4614),5078
(Missing),60422

Value,Count,Frequency (%),Unnamed: 3
нет,26,0.0%,
работаю,24,0.0%,
Студент,19,0.0%,
-,18,0.0%,
Работаю,17,0.0%,
студент,16,0.0%,
Фотограф,14,0.0%,
1,14,0.0%,
работа,12,0.0%,
бурная,11,0.0%,

0,1
Distinct count,100
Unique (%),0.2%
Missing (%),59.5%
Missing (n),38984
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,34.289
Minimum,13
Maximum,116
Zeros (%),0.0%

0,1
Minimum,13
5-th percentile,27
Q1,30
Median,32
Q3,36
95-th percentile,47
Maximum,116
Range,103
Interquartile range,6

0,1
Standard deviation,8.6526
Coef of variation,0.25235
Kurtosis,27.756
Mean,34.289
MAD,4.9423
Skewness,4.3399
Sum,911560
Variance,74.867
Memory size,512.3 KiB

Value,Count,Frequency (%),Unnamed: 3
30.0,2882,4.4%,
31.0,2851,4.3%,
32.0,2555,3.9%,
29.0,2475,3.8%,
33.0,2330,3.6%,
34.0,1969,3.0%,
35.0,1622,2.5%,
28.0,1510,2.3%,
36.0,1258,1.9%,
37.0,996,1.5%,

Value,Count,Frequency (%),Unnamed: 3
13.0,2,0.0%,
14.0,6,0.0%,
15.0,7,0.0%,
16.0,4,0.0%,
17.0,3,0.0%,

Value,Count,Frequency (%),Unnamed: 3
112.0,4,0.0%,
113.0,8,0.0%,
114.0,3,0.0%,
115.0,4,0.0%,
116.0,12,0.0%,

0,1
Distinct count,2277
Unique (%),3.5%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,145.2
Minimum,0
Maximum,10000
Zeros (%),58.7%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,87
95-th percentile,775
Maximum,10000
Range,10000
Interquartile range,87

0,1
Standard deviation,444.6
Coef of variation,3.062
Kurtosis,112.7
Mean,145.2
MAD,211.29
Skewness,8.2221
Sum,9520546
Variance,197670
Memory size,512.3 KiB

Value,Count,Frequency (%),Unnamed: 3
0,38496,58.7%,
1,998,1.5%,
2,581,0.9%,
3,467,0.7%,
4,328,0.5%,
5,313,0.5%,
6,246,0.4%,
7,231,0.4%,
10,202,0.3%,
8,201,0.3%,

Value,Count,Frequency (%),Unnamed: 3
0,38496,58.7%,
1,998,1.5%,
2,581,0.9%,
3,467,0.7%,
4,328,0.5%,

Value,Count,Frequency (%),Unnamed: 3
8766,1,0.0%,
8953,1,0.0%,
9975,1,0.0%,
9999,1,0.0%,
10000,13,0.0%,

0,1
Distinct count,4922
Unique (%),7.5%
Missing (%),91.8%
Missing (n),60193

0,1
Мастер и Маргарита,33
нет,31
много,20
Other values (4918),5292
(Missing),60193

Value,Count,Frequency (%),Unnamed: 3
Мастер и Маргарита,33,0.1%,
нет,31,0.0%,
много,20,0.0%,
-,18,0.0%,
Азбука,15,0.0%,
фантастика,14,0.0%,
Фантастика,11,0.0%,
Много,10,0.0%,
не читаю,9,0.0%,
Библия,9,0.0%,

0,1
Distinct count,1766
Unique (%),2.7%
Missing (%),12.4%
Missing (n),8121

0,1
Санкт-Петербург,23876
Москва,16606
Новосибирск,593
Other values (1762),16373
(Missing),8121

Value,Count,Frequency (%),Unnamed: 3
Санкт-Петербург,23876,36.4%,
Москва,16606,25.3%,
Новосибирск,593,0.9%,
Екатеринбург,524,0.8%,
Нижний Новгород,432,0.7%,
Уфа,384,0.6%,
Киев,364,0.6%,
Казань,350,0.5%,
Краснодар,300,0.5%,
Ростов-на-Дону,279,0.4%,

0,1
Distinct count,155
Unique (%),0.2%
Missing (%),6.8%
Missing (n),4476

0,1
Россия,55477
Украина,1055
США,608
Other values (151),3953
(Missing),4476

Value,Count,Frequency (%),Unnamed: 3
Россия,55477,84.6%,
Украина,1055,1.6%,
США,608,0.9%,
Казахстан,507,0.8%,
Беларусь,447,0.7%,
Германия,424,0.6%,
Израиль,196,0.3%,
Великобритания,190,0.3%,
Узбекистан,114,0.2%,
Канада,110,0.2%,

0,1
Distinct count,6212
Unique (%),9.5%
Missing (%),0.0%
Missing (n),1

0,1
Александр,2260
Сергей,1747
Алексей,1701
Other values (6208),59860

Value,Count,Frequency (%),Unnamed: 3
Александр,2260,3.4%,
Сергей,1747,2.7%,
Алексей,1701,2.6%,
Ольга,1667,2.5%,
Дмитрий,1667,2.5%,
Елена,1641,2.5%,
Андрей,1596,2.4%,
Анна,1448,2.2%,
Екатерина,1403,2.1%,
Ирина,1261,1.9%,

0,1
Distinct count,2285
Unique (%),3.5%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,247.61
Minimum,0
Maximum,10000
Zeros (%),10.6%

0,1
Minimum,0
5-th percentile,0
Q1,52
Median,148
Q3,275
95-th percentile,710
Maximum,10000
Range,10000
Interquartile range,223

0,1
Standard deviation,523.16
Coef of variation,2.1128
Kurtosis,129.58
Mean,247.61
MAD,214.21
Skewness,9.72
Sum,16235788
Variance,273700
Memory size,512.3 KiB

Value,Count,Frequency (%),Unnamed: 3
0,6938,10.6%,
1,1180,1.8%,
2,571,0.9%,
3,406,0.6%,
4,292,0.4%,
6,265,0.4%,
5,239,0.4%,
98,205,0.3%,
112,204,0.3%,
76,200,0.3%,

Value,Count,Frequency (%),Unnamed: 3
0,6938,10.6%,
1,1180,1.8%,
2,571,0.9%,
3,406,0.6%,
4,292,0.4%,

Value,Count,Frequency (%),Unnamed: 3
9961,1,0.0%,
9976,1,0.0%,
9991,1,0.0%,
9997,2,0.0%,
10000,1,0.0%,

0,1
Distinct count,3684
Unique (%),5.6%
Missing (%),93.2%
Missing (n),61103

0,1
Жизнь,42
нет,39
прятки,35
Other values (3680),4350
(Missing),61103

Value,Count,Frequency (%),Unnamed: 3
Жизнь,42,0.1%,
нет,39,0.1%,
прятки,35,0.1%,
футбол,28,0.0%,
Футбол,27,0.0%,
жизнь,25,0.0%,
-,23,0.0%,
ролевые,21,0.0%,
шахматы,14,0.0%,
на нервах,13,0.0%,

0,1
Distinct count,51514
Unique (%),78.6%
Missing (%),20.1%
Missing (n),13177

0,1
Random Art,21
НОВЫЕ ФИЛЬМЫ 2019| КИНО НОВИНКИ + КиноБОТ,19
"Интересная планета - путешествия, туризм",16
Other values (51510),52336
(Missing),13177

Value,Count,Frequency (%),Unnamed: 3
Random Art,21,0.0%,
НОВЫЕ ФИЛЬМЫ 2019| КИНО НОВИНКИ + КиноБОТ,19,0.0%,
"Интересная планета - путешествия, туризм",16,0.0%,
ДТП и ЧП | Санкт-Петербург | Питер Онлайн | СПб,16,0.0%,
Интересная Москва,11,0.0%,
Автолюбитель,11,0.0%,
Музыка,11,0.0%,
Begin English. Английский язык для всех,10,0.0%,
Лепра,9,0.0%,
Интересные события в Санкт-Петербурге,9,0.0%,

0,1
Distinct count,65569
Unique (%),100.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,938390
Minimum,18
Maximum,1889983
Zeros (%),0.0%

0,1
Minimum,18
5-th percentile,91763
Q1,464580
Median,928400
Q3,1413400
95-th percentile,1794800
Maximum,1889983
Range,1889965
Interquartile range,948840

0,1
Standard deviation,546320
Coef of variation,0.58219
Kurtosis,-1.2024
Mean,938390
MAD,473240
Skewness,0.017303
Sum,61529214122
Variance,298470000000
Memory size,512.3 KiB

Value,Count,Frequency (%),Unnamed: 3
19105,1,0.0%,
1404793,1,0.0%,
1534584,1,0.0%,
332407,1,0.0%,
596598,1,0.0%,
66165,1,0.0%,
1247860,1,0.0%,
1258099,1,0.0%,
1649264,1,0.0%,
1294957,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
18,1,0.0%,
22,1,0.0%,
24,1,0.0%,
36,1,0.0%,
54,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
1889821,1,0.0%,
1889834,1,0.0%,
1889935,1,0.0%,
1889948,1,0.0%,
1889983,1,0.0%,

0,1
Distinct count,5842
Unique (%),8.9%
Missing (%),90.3%
Missing (n),59227

0,1
разные,25
Спорт,21
спорт,21
Other values (5838),6275
(Missing),59227

Value,Count,Frequency (%),Unnamed: 3
разные,25,0.0%,
Спорт,21,0.0%,
спорт,21,0.0%,
Жизнь,18,0.0%,
Музыка,18,0.0%,
-,16,0.0%,
разнообразные,15,0.0%,
музыка,15,0.0%,
Фотография,14,0.0%,
жизнь,14,0.0%,

0,1
Distinct count,33540
Unique (%),51.2%
Missing (%),0.0%
Missing (n),2

0,1
Иванов,525
Иванова,444
Смирнова,210
Other values (33536),64388

Value,Count,Frequency (%),Unnamed: 3
Иванов,525,0.8%,
Иванова,444,0.7%,
Смирнова,210,0.3%,
Петров,202,0.3%,
Смирнов,182,0.3%,
Петрова,177,0.3%,
Васильев,164,0.3%,
Кузнецов,154,0.2%,
Васильева,148,0.2%,
Кузнецова,135,0.2%,

0,1
Distinct count,5384
Unique (%),8.2%
Missing (%),91.1%
Missing (n),59762

0,1
много,36
комедии,20
нет,18
Other values (5380),5733
(Missing),59762

Value,Count,Frequency (%),Unnamed: 3
много,36,0.1%,
комедии,20,0.0%,
нет,18,0.0%,
Комедии,17,0.0%,
-,16,0.0%,
Достучаться до небес,15,0.0%,
Много,15,0.0%,
их много,14,0.0%,
все,9,0.0%,
...,8,0.0%,

0,1
Distinct count,5263
Unique (%),8.0%
Missing (%),90.9%
Missing (n),59608

0,1
разная,37
Разная,30
рок,27
Other values (5259),5867
(Missing),59608

Value,Count,Frequency (%),Unnamed: 3
разная,37,0.1%,
Разная,30,0.0%,
рок,27,0.0%,
Меломан,27,0.0%,
по настроению,25,0.0%,
По настроению,23,0.0%,
Рок,21,0.0%,
под настроение,19,0.0%,
Под настроение,17,0.0%,
тишина,16,0.0%,

0,1
Distinct count,5657
Unique (%),8.6%
Missing (%),80.3%
Missing (n),52684

0,1
{'langs': ['Русский']},2012
{'political': 3},654
"{'people_main': 0, 'life_main': 0, 'smoking': 0, 'alcohol': 0}",506
Other values (5653),9713
(Missing),52684

Value,Count,Frequency (%),Unnamed: 3
{'langs': ['Русский']},2012,3.1%,
{'political': 3},654,1.0%,
"{'people_main': 0, 'life_main': 0, 'smoking': 0, 'alcohol': 0}",506,0.8%,
"{'langs': ['Русский'], 'people_main': 0, 'life_main': 0, 'smoking': 0, 'alcohol': 0}",504,0.8%,
{'political': 8},368,0.6%,
{'political': 4},291,0.4%,
"{'political': 3, 'langs': ['Русский']}",227,0.3%,
"{'political': 3, 'religion': 'Православие'}",126,0.2%,
"{'political': 8, 'langs': ['Русский']}",119,0.2%,
"{'langs': ['Русский', 'English']}",115,0.2%,

0,1
Distinct count,2952
Unique (%),4.5%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,259.11
Minimum,0
Maximum,94069
Zeros (%),17.6%

0,1
Minimum,0
5-th percentile,0
Q1,3
Median,41
Q3,216
95-th percentile,1130
Maximum,94069
Range,94069
Interquartile range,213

0,1
Standard deviation,943.31
Coef of variation,3.6406
Kurtosis,2076.2
Mean,259.11
MAD,327.5
Skewness,31.459
Sum,16989601
Variance,889840
Memory size,512.3 KiB

Value,Count,Frequency (%),Unnamed: 3
0,11551,17.6%,
1,2990,4.6%,
2,1683,2.6%,
3,1264,1.9%,
4,1013,1.5%,
5,873,1.3%,
6,787,1.2%,
7,724,1.1%,
8,689,1.1%,
9,646,1.0%,

Value,Count,Frequency (%),Unnamed: 3
0,11551,17.6%,
1,2990,4.6%,
2,1683,2.6%,
3,1264,1.9%,
4,1013,1.5%,

Value,Count,Frequency (%),Unnamed: 3
36405,1,0.0%,
43976,1,0.0%,
46681,1,0.0%,
54083,1,0.0%,
94069,1,0.0%,

0,1
Distinct count,6332
Unique (%),9.7%
Missing (%),90.1%
Missing (n),59064

0,1
нет,41
-,18
,15
Other values (6328),6431
(Missing),59064

Value,Count,Frequency (%),Unnamed: 3
нет,41,0.1%,
-,18,0.0%,
,15,0.0%,
...,14,0.0%,
.,9,0.0%,
много,8,0.0%,
1,6,0.0%,
Нет,5,0.0%,
,5,0.0%,
8,5,0.0%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Mean,1.4927

0,1
1,33261
2,32308

Value,Count,Frequency (%),Unnamed: 3
1,33261,50.7%,
2,32308,49.3%,

0,1
Distinct count,3058
Unique (%),4.7%
Missing (%),93.9%
Missing (n),61598

0,1
нет,145
КВН,57
не смотрю,45
Other values (3054),3724
(Missing),61598

Value,Count,Frequency (%),Unnamed: 3
нет,145,0.2%,
КВН,57,0.1%,
не смотрю,45,0.1%,
-,42,0.1%,
Comedy Club,24,0.0%,
нету,22,0.0%,
Новости,22,0.0%,
Не смотрю,21,0.0%,
Нет,17,0.0%,
Что? Где? Когда?,17,0.0%,

0,1
Distinct count,2052
Unique (%),3.1%
Missing (%),76.5%
Missing (n),50172

0,1
СПбГУ,761
МГУ,519
СПбПУ Петра Великого (Политех),462
Other values (2048),13655
(Missing),50172

Value,Count,Frequency (%),Unnamed: 3
СПбГУ,761,1.2%,
МГУ,519,0.8%,
СПбПУ Петра Великого (Политех),462,0.7%,
СПбГЭУ,402,0.6%,
РГПУ им. А. И. Герцена,244,0.4%,
СПбГЭТУ (ЛЭТИ),213,0.3%,
СПбГИК (бывш. СПбГУКИ),173,0.3%,
МГТУ им. Н. Э. Баумана,165,0.3%,
СПбГУТ им. Бонч-Бруевича,161,0.2%,
СПбГУАП,157,0.2%,

0,1
Distinct count,1594
Unique (%),2.4%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,96.695
Minimum,0
Maximum,7692
Zeros (%),22.4%

0,1
Minimum,0
5-th percentile,0
Q1,1
Median,19
Q3,84
95-th percentile,421
Maximum,7692
Range,7692
Interquartile range,83

0,1
Standard deviation,264.12
Coef of variation,2.7314
Kurtosis,135.72
Mean,96.695
MAD,119.11
Skewness,9.1757
Sum,6340209
Variance,69758
Memory size,512.3 KiB

Value,Count,Frequency (%),Unnamed: 3
0,14702,22.4%,
1,2768,4.2%,
2,1828,2.8%,
3,1462,2.2%,
4,1298,2.0%,
5,1104,1.7%,
6,1069,1.6%,
8,891,1.4%,
7,886,1.4%,
9,806,1.2%,

Value,Count,Frequency (%),Unnamed: 3
0,14702,22.4%,
1,2768,4.2%,
2,1828,2.8%,
3,1462,2.2%,
4,1298,2.0%,

Value,Count,Frequency (%),Unnamed: 3
6235,1,0.0%,
6278,1,0.0%,
6630,1,0.0%,
7169,1,0.0%,
7692,1,0.0%,

Unnamed: 0,id,first_name,last_name,sex,age,country,city,videos,audios,photos,friends,groups,university,activities,books,games,interests,movies,music,personal,quotes,tv
0,1768318,Мария,Свирид,1,41.0,Россия,Санкт-Петербург,89,0,1836,274,"Интересные события в Санкт-Петербурге,Вдохнове...",,,,,,,,,,
1,1309850,Денис,Еременко,2,,Россия,Краснодар,77,0,818,369,"ЁП,fresh_home,Телеканал «РОССИЯ»,Типичный Крас...",,,,,,,,,,
2,1391546,Слава,Кузнецов,2,,Россия,Красноярск,0,0,0,1,,КрасГАУ,,,,,,,,,
3,945010,Алиса,Мельникова,1,31.0,Россия,Москва,66,599,648,157,"я В шоке!,Тонкий юмор,Киномир • Новинки 2018,...",,,,,,,,,,
4,1031069,Ирина,Осипова,1,44.0,Россия,Санкт-Петербург,14,32,3,73,Санкт-Петербург - это мой город! | Питер | СПБ...,,,,,,,,,,


Возраст в основном распределён в пределах 25–50 лет; остальные значения либо скрыты, либо завышены. Поэтому предсказывать возраст можно только в этом диапазоне.
В поле города много уникальных названий, но можно предположить, что пользователи преимущественно живут в России, в крупных городах. Те, кто скрывает своё местоположение, окажутся в шуме. Поэтому можно попробовать предсказать город по группам, в которых состоит человек.

# Предсказание города, в котором живёт человек, по группам, в которых он состоит


Логично вытаскивать названия городов из названий групп. Но далеко не у всех пользователей в группах есть названия городов. Кроме того, группы ВК отражают интересы основных групп пользователей в конкретных городах. Я бы ещё добавил признак возраста, но он лежит в узком диапазоне. Для интереса можно было бы вообще исключить названия городов.

Разобьём данные на обучающую и тестовую выборки; на тестовой проверим качество после нахождения параметров модели.

In [10]:
# Reuse the cached train/test split when both pickles exist; otherwise split
# the table in half by position and cache the halves.
# NOTE: only load pickles that were written by this notebook.
try:
    with open('df_train.pkl', 'rb') as f:
        df_train = pickle.load(f)
    with open('df_test.pkl', 'rb') as f:
        df_test = pickle.load(f)
except FileNotFoundError:
    half = df.shape[0] // 2
    df_train = df.iloc[:half]
    df_test = df.iloc[half:]
    with open('df_train.pkl', 'wb') as f:
        pickle.dump(df_train, f)
    with open('df_test.pkl', 'wb') as f:
        pickle.dump(df_test, f)

# Drop the reference to the full table. Guarded so that re-running the cell
# (when ``df`` is already gone) does not fail with NameError.
try:
    del df
except NameError:
    pass
print(df_train.shape)
print(df_test.shape)


(32784, 22)
(32785, 22)


In [11]:
# import nltk
# nltk.download('punkt')
# nltk.download("stopwords")

In [61]:
from sklearn.feature_extraction.text import TfidfVectorizer
# pip install git+git://github.com/scikit-learn/scikit-learn.git
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords
from nltk.stem.snowball import RussianStemmer
import nltk
import string


def my_tokenizer(text):
    """Split ``text`` into word tokens and stem each one with the Russian Snowball stemmer.

    :param text: string to tokenize
    :return: list of stemmed tokens, in token order
    """
    # One stemmer per call instead of one per token.
    stemmer = RussianStemmer()
    return [stemmer.stem(token) for token in nltk.word_tokenize(text)]

class Get_xy():
    """Builds model inputs from the profile table.

    ``X`` is a TF-IDF matrix of the cleaned ``groups`` text and ``y`` is the
    label-encoded ``city`` column. Rows with missing ``city`` or ``groups``
    must be dropped by the caller: the cleaning step calls ``str.lower`` on
    every ``groups`` value.
    """

    def __init__(self, n_features=200):
        """
        :param n_features: ``max_features`` passed to the TF-IDF vectorizer
        """
        self.le = None
        self.vectorizer = None
        self.delete_str = []
        self.n_features = n_features

    def _clean_groups(self, groups):
        """Lower-case each string and blank out every character of ``self.delete_str``.

        :param groups: iterable of strings
        :return: list of cleaned strings, same order as the input
        """
        # Build the translation table once instead of once per string.
        table = str.maketrans(self.delete_str, ' ' * len(self.delete_str))
        return [s.lower().translate(table) for s in groups]

    def fit(self, df):
        """Fit the city label encoder and the TF-IDF vectorizer.

        :param df: DataFrame with ``city`` and ``groups`` columns
        """
        self.le = LabelEncoder()
        self.le.fit(df['city'])

        stop_words = stopwords.words('russian')
        stop_words.extend(stopwords.words('english'))
        stop_words.extend(['©', '«', '»','—', '•', '№', '●', '★', '♥', 'ツ'])
        # Stop words go through the same tokenizer as the documents so that
        # they match the stemmed tokens.
        stop_words = [my_tokenizer(sw)[0] for sw in stop_words]
        self.delete_str = string.punctuation + '1234567890'

        self.vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,
                                 max_features=self.n_features,
                                 stop_words=stop_words,
                                 tokenizer=my_tokenizer)
        self.vectorizer.fit(self._clean_groups(df['groups']))

    def transform(self, df):
        """Encode a table with the fitted encoder and vectorizer.

        :param df: DataFrame with ``city`` and ``groups`` columns; not modified
        :return: ``(X, y)`` as returned by the vectorizer's and the label
            encoder's ``transform``
        """
        groups_clean = self._clean_groups(df['groups'])
        y = self.le.transform(df['city'])
        X = self.vectorizer.transform(groups_clean)

        return X,y

    def fit_transform(self, df):
        """Fit on ``df`` and return its ``(X, y)`` encoding."""
        self.fit(df)
        return self.transform(df)

# Reuse the cached training matrices and fitted transformer when available.
# NOTE: only load pickles that were written by this notebook.
try:
    with open('xy_train.pkl', 'rb') as f:
        X = pickle.load(f)
        y = pickle.load(f)
        get_xy = pickle.load(f)
except FileNotFoundError:
    # The transformer cannot handle missing city / groups values.
    df_xy = df_train.dropna(subset=['city', 'groups'])
    get_xy = Get_xy()
    X,y = get_xy.fit_transform(df_xy)
    del df_xy
    with open('xy_train.pkl', 'wb') as f:
        pickle.dump(X, f)
        pickle.dump(y, f)
        pickle.dump(get_xy, f)

# ``get_feature_names`` was removed in scikit-learn 1.2 in favour of
# ``get_feature_names_out``; fall back to the old name on old releases.
if hasattr(get_xy.vectorizer, 'get_feature_names_out'):
    feature_names = get_xy.vectorizer.get_feature_names_out().tolist()
else:
    feature_names = get_xy.vectorizer.get_feature_names()
print(feature_names)

['aliexpress', 'appét', 'art', 'bon', 'club', 'cook', 'decor', 'design', 'e', 'english', 'fashion', 'fitness', 'gif', 'handmad', 'hd', 'histor', 'kudag', 'l', 'lif', 'liv', 'music', 'onlin', 'ru', 'scienc', 'u', 'vide', 'world', 'авт', 'академ', 'английск', 'анекдот', 'арт', 'аудиокниг', 'афиш', 'барахолк', 'бесплатн', 'бизнес', 'блог', 'ваш', 'вдохновен', 'велик', 'виде', 'вконтакт', 'вкусн', 'вопрос', 'вязан', 'г', 'город', 'горя', 'групп', 'дар', 'дач', 'девушк', 'декор', 'ден', 'дет', 'детск', 'дизайн', 'дневник', 'добр', 'дом', 'домашн', 'дтп', 'душ', 'женск', 'женщин', 'жизн', 'журна', 'здоров', 'игр', 'ид', 'интеллектуальн', 'интересн', 'интернет', 'интерьер', 'искусств', 'истор', 'ищ', 'йог', 'кадр', 'кажд', 'квартир', 'квартирн', 'кин', 'киноман', 'клуб', 'книг', 'кот', 'красот', 'кулинар', 'кулинарн', 'кухн', 'лайфхак', 'лепр', 'литератур', 'любов', 'люд', 'магазин', 'макияж', 'мам', 'маникюр', 'мастер', 'мастерск', 'мир', 'мод', 'модел', 'москв', 'мотивац', 'мужск', 'музык',

In [13]:
# dill.dump_session('hw_vector.db')

In [14]:
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV, cross_val_score
from scipy.stats import randint,expon
from time import time
from sklearn.metrics import mean_squared_error, make_scorer, accuracy_score
from sklearn.model_selection import train_test_split

def report(search_res):
    """Print the best parameter set and the best score of a fitted search.

    :param search_res: object exposing ``best_params_`` and ``best_score_``
    """
    best_params = search_res.best_params_
    best_score = search_res.best_score_
    print(f"Best param: {best_params}")
    print(f"Best score: {best_score}")

def test_model(model, X, y, random_state=None):
    """Fit ``model`` on a random train split and print accuracy on the held-out split.

    :param model: estimator with ``fit`` and ``predict``
    :param X: feature matrix
    :param y: target labels
    :param random_state: forwarded to ``train_test_split``; pass an int for a
        reproducible split (the default keeps the split random)
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state)

    model.fit(X_train, y_train)
    # accuracy_score expects (y_true, y_pred), in that order.
    print("score:", accuracy_score(y_test, model.predict(X_test)))

def gs_model(model, param, X, y, n_iter_search=20, cv=3, jobs=4, verbose=10):
    """Run an accuracy-scored randomized hyper-parameter search and time it.

    :param model: estimator to tune.
    :param param: parameter value lists / distributions, forwarded as
        ``param_distributions``.
    :param X: feature matrix passed to ``search.fit``.
    :param y: target labels passed to ``search.fit``.
    :param n_iter_search: requested number of sampled parameter settings.
    :param cv: cross-validation setting forwarded to the search.
    :param jobs: forwarded as ``n_jobs``.
    :param verbose: verbosity level forwarded to the search.
    :return: the fitted ``RandomizedSearchCV`` object.
    """
    search = RandomizedSearchCV(model, param_distributions=param, scoring="accuracy",
                                n_iter=n_iter_search, cv=cv, n_jobs=jobs,
                                verbose=verbose, pre_dispatch=4)
    start = time()
    search.fit(X, y)
    elapsed = time() - start

    # Report how many settings were actually evaluated rather than how many
    # were requested: the two differ when the search runs fewer candidates
    # than n_iter_search (e.g. a list-only grid with fewer combinations).
    n_candidates = len(search.cv_results_["params"])
    print("RandomizedSearchCV took %.2f seconds for %d candidates parameter settings." % (elapsed, n_candidates))
    return search

def cv_model(model, X, y, jobs=4, cv=5):
    """Cross-validate ``model`` on accuracy and print the per-fold scores.

    Prints the array of fold scores, then their mean and standard deviation.

    :param model: estimator to evaluate.
    :param X: feature matrix.
    :param y: target labels.
    :param jobs: forwarded to ``cross_val_score`` as ``n_jobs``.
    :param cv: cross-validation setting forwarded to ``cross_val_score``.
        Defaults to 5, the value that used to be hard-coded, so existing
        calls behave as before.
    """
    scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy', n_jobs=jobs)
    print(scores)
    print(scores.mean(), scores.std())

In [15]:
# Cross-validation setting passed as cv= to every gs_model(...) call in the
# model-search cells of this notebook.
search_cv = 2

In [16]:
from sklearn.linear_model import RidgeClassifier

# Hyper-parameter search for RidgeClassifier, cached on disk in 'lc.pkl'.
# If the cache file exists the stored search object is loaded; otherwise the
# search is run with gs_model and the result is pickled for later runs.
# NOTE(review): pickle.load executes arbitrary code from the file -- only load
# cache files produced by this notebook.
try:
    with open('lc.pkl', 'rb') as f:
        src = pickle.load(f)
except FileNotFoundError:
    lin_cls = RidgeClassifier()

    # test_model(lin_cls,X,y)
    # Candidate regularisation strengths; only alpha is tuned.
    # NOTE(review): 9 candidates here vs. gs_model's default n_iter_search=20 --
    # confirm how the installed scikit-learn handles n_iter larger than the grid.
    param = {"alpha":[0.001,0.1,1,2,3,5,10,100,1000]}
    src = gs_model(lin_cls, param,X,y,cv=search_cv)
    with open('lc.pkl', 'wb') as f:
        pickle.dump(src, f)
    # The unfitted template estimator is no longer needed once the search is stored.
    del lin_cls

# Print the best parameters and best score of the (loaded or fresh) search.
report(src)
# model = src.best_estimator_
# cv_model(model,X,y)

# dill.dump_session('hw_lc.db')

Best param: {'alpha': 100}
Best score: 0.6086975040463413


In [17]:
from sklearn.tree import DecisionTreeClassifier

# Hyper-parameter search for DecisionTreeClassifier, cached on disk in 'tc.pkl'.
# Same load-or-compute pattern as the other model cells: reuse the pickled
# search if present, otherwise run gs_model and store the result.
# NOTE: this rebinds the global `src` to the tree search.
try:
    with open('tc.pkl', 'rb') as f:
        src = pickle.load(f)
except FileNotFoundError:
    tree_cls = DecisionTreeClassifier()

    # test_model(tree_cls,X,y)
    # Tuned jointly: tree depth (None is one of the candidates) and the
    # minimum number of samples required to split a node.
    param = {"max_depth":[None,2,5,10,15],
            "min_samples_split":[2,5,10,20,50,100]}
    src = gs_model(tree_cls, param,X,y,cv=search_cv)
    with open('tc.pkl', 'wb') as f:
        pickle.dump(src, f)
    # The unfitted template estimator is no longer needed once the search is stored.
    del tree_cls
    
# Print the best parameters and best score of the (loaded or fresh) search.
report(src)
# model = src.best_estimator_
# cv_model(model,X,y)

# dill.dump_session('hw_tc.db')

Best param: {'min_samples_split': 10, 'max_depth': 5}
Best score: 0.6055456171735242


In [18]:
import xgboost as xgb

# Two-stage hyper-parameter search for XGBClassifier, each stage cached in its
# own pickle file ('xc1.pkl', then 'xc2.pkl').

# Stage 1: tune max_depth, learning_rate and reg_lambda with n_estimators
# fixed at 20.
try:
    with open('xc1.pkl', 'rb') as f:
        src = pickle.load(f)
except FileNotFoundError:
    xgb_cls = xgb.XGBClassifier(nthread=4, n_estimators=20)
    # NOTE(review): the commented call below uses xgb_reg/vec/df, which do not
    # match the names used in this cell (xgb_cls, X, y) -- looks stale, confirm.
    # test_model(xgb_reg,vec,df)
    param = {
    #         "n_estimators":[50,100,150,200],
            "max_depth":[2,3,5,10,15],
            "learning_rate":[0.1,1,10],
             "reg_lambda":[0.1,1,10]
            }
    # The search itself runs with jobs=1; the classifier is created with nthread=4.
    src = gs_model(xgb_cls,  param,X,y,jobs=1,cv=search_cv)
    with open('xc1.pkl', 'wb') as f:
        pickle.dump(src, f)
    del xgb_cls
    
report(src)

# Stage 2: starting from the best stage-1 estimator, tune only n_estimators.
# When 'xc2.pkl' is missing, the except branch reads `src` as left by stage 1,
# so stage 1 must have completed in the same run.
try:
    with open('xc2.pkl', 'rb') as f:
        src = pickle.load(f)
except FileNotFoundError:
    model = src.best_estimator_
    # NOTE(review): 4 candidates here vs. gs_model's default n_iter_search=20 --
    # confirm how the installed scikit-learn handles n_iter larger than the grid.
    param = {"n_estimators":[50,100,150,200]}
    src = gs_model(model,  param,X,y,jobs=1,cv=search_cv)
    with open('xc2.pkl', 'wb') as f:
        pickle.dump(src, f)

# Print the best parameters and best score of the stage-2 search.
report(src)
# model = src.best_estimator_
# cv_model(model,X,y)

# dill.dump_session('hw_xc.db')

Best param: {'reg_lambda': 10, 'max_depth': 15, 'learning_rate': 0.1}
Best score: 0.5986029474401567
Best param: {'n_estimators': 100}
Best score: 0.6076752704659681


В целом предсказание работает, и все модели показывают примерно одинаковое качество. Точность в районе 60%, скорее всего, ограничена шумом в данных. Чтобы улучшить предсказание, можно почистить данные и удалить пользователей, которые живут не в России или в некрупных городах.

# Проверка на тестовой выборке

In [74]:
# Evaluate the tuned linear classifier on the held-out test set.
# NOTE: from this cell on, X and y hold the TEST features/labels, not the
# training ones.

# Optional: restore the training artefacts (including get_xy) from disk.
# dill.load_session('hw_lc.db')
# with open('xy_train.pkl', 'rb') as f:
#     X = pickle.load(f)
#     y = pickle.load(f)
#     get_xy = pickle.load(f)

# Load the cached test features/labels; build and cache them if missing.
# NOTE(review): pickle.load executes arbitrary code from the file -- only load
# cache files produced by this notebook.
try:
    with open('xy_test.pkl', 'rb') as f:
        X = pickle.load(f)
        y = pickle.load(f)
#     with open('xy_train.pkl', 'rb') as f:
#         X_train = pickle.load(f)
#         y_train = pickle.load(f)
except FileNotFoundError:
    df_xy = df_test.dropna(subset=['city', 'groups'])

    # New cities that are absent from the training set: append each one once,
    # in order of first appearance, to the classes known to the label encoder
    # so that transform() does not reject them.  isin()/unique() replace the
    # former side-effect list comprehension and its O(n*m) list scans.
    # NOTE(review): if get_xy.le is a scikit-learn LabelEncoder, classes_ is
    # normally kept sorted; appending at the end may break that ordering --
    # confirm the resulting encodings are what is intended.
    cls = get_xy.le.classes_.tolist()
    cls.extend(df_xy.loc[~df_xy['city'].isin(cls), 'city'].unique().tolist())
    get_xy.le.classes_ = np.array(cls)

    X,y = get_xy.transform(df_xy)
    with open('xy_test.pkl', 'wb') as f:
        pickle.dump(X, f)
        pickle.dump(y, f)

# Score the best estimator found by the RidgeClassifier search ('lc.pkl').
with open('lc.pkl', 'rb') as f:
    src = pickle.load(f)
model = src.best_estimator_
# model.fit(X_train,y_train)
# accuracy_score is documented as (y_true, y_pred); the value is unchanged.
print("score:",accuracy_score(y, model.predict(X)))

score: 0.6029917980536313


Качество модели на тестовой выборке (≈0.603) практически не отличается от качества при кросс-валидации на обучающей выборке (≈0.609): модель не переобучилась и относительно устойчива к шуму.