In [1]:
import json
import os
from collections import defaultdict
import pandas as pd
import numpy as np

In [2]:
# Folder scanned by the ingestion cell; every entry in it is opened and parsed as JSON.
read_folder = "../data/raw/chaos/"
# Destination prefix for every CSV written by the export cells.
# Both values end with a slash because later cells append file names directly.
write_folder = "../data/clean/"

In [3]:
"""CLASS TO CREATE DATABASES FOR 
    1. Total Battle Counts on Pokemon Showdown
    2. Pokemon Usage Rates
    3. Pokemon Abilities
    4. Pokemon Teammates 
    5. Pokemon Counters 
    6. Pokemon Natures 
"""
class PokeData(): #we'll use this class as our parent
    """Base class holding a DataFrame that grows one batch of rows at a time.

    Subclasses override ``_add_new_data`` to turn keyword arguments into a
    DataFrame; ``update_df`` appends that DataFrame to ``self.df``.
    """

    def __init__(self, columns=None):
        """
        INPUT:
            columns: iterable of column names for the (initially empty) table.
                     Defaults to no columns.
        """
        # A literal ``[]`` default would be one list object shared by every
        # instance; copy the input so each instance owns its column list.
        self.columns = list(columns) if columns is not None else []
        self.df = pd.DataFrame(columns=self.columns)

    def _add_new_data(self, **kwargs):
        """Hook for subclasses: build and return the DataFrame of new rows.

        RAISES:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError(
            '{} must implement _add_new_data()'.format(type(self).__name__))

    def update_df(self, **kwargs):
        """Append the rows produced by ``_add_new_data(**kwargs)`` to ``self.df``.

        The index is rebuilt (``ignore_index=True``) and columns are kept in
        their existing order (``sort=False``).
        """
        new_data = self._add_new_data(**kwargs)
        self.df = pd.concat([self.df, new_data], sort=False, ignore_index=True)
        
## MONTHLY POPULARITY EXTRACTION        
class Popularity(PokeData):
    """Table with one row per ``update_df`` call: a month label and a battle count."""

    def __init__(self, columns=None):
        """
        INPUT:
            columns: list of the two column names; defaults to
                     ['month', 'num_battles'].
        """
        # Build the default inside the call so instances never share one list.
        if columns is None:
            columns = ['month', 'num_battles']
        PokeData.__init__(self, columns)

    def _add_new_data(self, month=str, value=int):
        """Return a one-row DataFrame ``[month, value]`` using ``self.columns``.

        INPUT:
            month: label stored in the first column
            value: number stored in the second column
        RAISES:
            TypeError: if either argument is left at its placeholder default.
        """
        # The signature defaults are the ``str`` / ``int`` type objects, not
        # usable values. Reject them so a forgotten argument fails loudly
        # instead of writing a class object into the table.
        if month is str or value is int:
            raise TypeError(
                "Popularity._add_new_data() requires both 'month' and 'value'")
        return pd.DataFrame([[month, value]], columns=self.columns)
        
class Usage(PokeData):
    """Per-entry usage table plus teammate, counter, ability and nature tables.

    Names (entries, abilities, natures) are replaced by integer ids handed out
    in first-seen order; the name -> id mappings are kept in ``ids``,
    ``abilities`` and ``natures``.
    """

    def __init__(self, columns=None, min_usage=0.005, min_check_pct=0.5):
        """
        INPUT:
            columns      : column names of the main table; defaults to
                           ['id', 'month', 'count', 'usage']
            min_usage    : entries whose 'usage' value is below this are ignored
            min_check_pct: a counter is kept only when the second element of
                           its value list is strictly above this
        """
        # Build the default inside the call so instances never share one list.
        if columns is None:
            columns = ['id', 'month', 'count', 'usage']
        PokeData.__init__(self, columns)
        self.min_usage = min_usage
        self.min_check_pct = min_check_pct
        self.ids = dict()
        self.next_id = 0
        self.teammate_df = pd.DataFrame(columns=['id', 'mate_id', 'x', 'month'])
        self.counter_df = pd.DataFrame(columns=['id', 'counter_id', 'num_battles', 'check_pct', 'month'])
        self.abilities = dict()
        self.next_ab = 0
        self.ability_df = pd.DataFrame(columns=['id', 'ability_id', 'count', 'month'])
        self.natures = dict()
        self.next_nat = 0
        self.nature_df = pd.DataFrame(columns=['id', 'nature_id', 'count', 'month'])

    def _register(self, registry, counter_attr, name):
        """Return the id of ``name`` in ``registry``, assigning the next free id if new.

        INPUT:
            registry    : dict mapping name -> id (updated in place)
            counter_attr: name of the attribute on self holding the next free id
            name        : string to look up
        """
        if name not in registry:
            next_value = getattr(self, counter_attr)
            registry[name] = next_value
            setattr(self, counter_attr, next_value + 1)
        return registry[name]

    def _update_ids(self, name):
        """Make sure ``name`` has an id in ``self.ids`` and return that id."""
        return self._register(self.ids, 'next_id', name)

    def _update_abilities(self, name):
        """Make sure ``name`` has an id in ``self.abilities`` and return that id."""
        return self._register(self.abilities, 'next_ab', name)

    def _update_natures(self, name):
        """Make sure ``name`` has an id in ``self.natures`` and return that id."""
        return self._register(self.natures, 'next_nat', name)

    @staticmethod
    def _append_rows(frame, rows):
        """Return ``frame`` with the column -> list mapping ``rows`` appended."""
        if not rows:
            # Nothing was collected, so there is nothing to append.
            return frame
        return pd.concat([frame, pd.DataFrame(rows)], sort=False, ignore_index=True)

    def _add_new_data(self, dic=dict, month=str, targets=dict):
        """Extract one month of data and return the new rows of the main table.

        Side effect: the teammate, counter, ability and nature tables on self
        are extended with the rows found for the same month.

        INPUT:
            dic    : dict mapping name -> record; each record is read through
                     the keys 'usage', 'Abilities', 'Spreads',
                     'Checks and Counters' and 'Teammates'
            month  : label stored in the 'month' column of every row
            targets: dict mapping output column name -> key to copy from the record
        RAISES:
            TypeError: if any argument is left at its placeholder default.
        """
        # The signature defaults are type objects, not usable values.
        if dic is dict or month is str or targets is dict:
            raise TypeError(
                "Usage._add_new_data() requires 'dic', 'month' and 'targets'")

        usage_rows = defaultdict(list)
        ability_rows = defaultdict(list)
        nature_rows = defaultdict(list)
        counter_rows = defaultdict(list)
        teammate_rows = defaultdict(list)

        for name, sub in dic.items():
            if sub['usage'] < self.min_usage:
                continue

            id_ = self._update_ids(name)
            usage_rows['id'].append(id_)
            usage_rows['month'].append(month)
            for column, source_key in targets.items():
                usage_rows[column].append(sub[source_key])

            for ability, count in sub['Abilities'].items():
                ability_rows['id'].append(id_)
                ability_rows['ability_id'].append(self._update_abilities(ability))
                ability_rows['count'].append(count)
                ability_rows['month'].append(month)

            # 'Spreads' keys are split on ':' and only the first part is kept,
            # so the counts of all spreads sharing that prefix are summed.
            nature_counts = defaultdict(int)
            for spread, count in sub['Spreads'].items():
                nature_counts[spread.split(':')[0]] += count

            for nature, count in nature_counts.items():
                nature_rows['id'].append(id_)
                nature_rows['nature_id'].append(self._update_natures(nature))
                nature_rows['count'].append(count)
                nature_rows['month'].append(month)

            # NOTE(review): counters are not filtered by usage, so counter_id
            # may point at an entry that has no row in the main table for this
            # month. Confirm whether that is intended.
            for counter, arr in sub['Checks and Counters'].items():
                if counter in dic and arr[1] > self.min_check_pct:
                    counter_rows['id'].append(id_)
                    counter_rows['counter_id'].append(self._update_ids(counter))
                    counter_rows['num_battles'].append(arr[0])
                    counter_rows['check_pct'].append(arr[1])
                    counter_rows['month'].append(month)

            # Use the same cut-off as the main loop (>= min_usage) so an entry
            # sitting exactly on the threshold is kept as a teammate too.
            for mate, x in sub['Teammates'].items():
                if mate in dic and dic[mate]['usage'] >= self.min_usage:
                    teammate_rows['id'].append(id_)
                    teammate_rows['mate_id'].append(self._update_ids(mate))
                    teammate_rows['x'].append(x)
                    teammate_rows['month'].append(month)

        # The side tables are only touched after the whole month was read, so
        # an exception raised while reading leaves them unchanged.
        self.teammate_df = self._append_rows(self.teammate_df, teammate_rows)
        self.counter_df = self._append_rows(self.counter_df, counter_rows)
        self.ability_df = self._append_rows(self.ability_df, ability_rows)
        self.nature_df = self._append_rows(self.nature_df, nature_rows)

        return pd.DataFrame(usage_rows)

In [4]:
popularity = Popularity()
usage = Usage()
# (file name, exception) for every file that could not be ingested.
skipped_files = []

for fp in sorted(os.listdir(read_folder)):
    # The month label below is sliced assuming a five-character suffix, so
    # anything that does not end in '.json' is not a file this loop can label.
    if not fp.endswith('.json'):
        continue
    try:
        with open(os.path.join(read_folder, fp), 'r') as f:
            d = json.load(f)

        dic = d['data']
        # Two characters before the suffix + first four characters of the name
        # (presumably names look like 'YYYY-MM.json' -- verify against the data).
        month = fp[-7:-5] + '-' + fp[:4]
        num_battles = d['info']['number of battles']

        # Update usage first: if it raises, the month is not recorded in the
        # popularity table either.
        usage.update_df(dic=dic, month=month, targets={'count':'Raw count',
                                                       'usage':'usage'})
        popularity.update_df(month=month, value=num_battles)
    except Exception as err:
        # Still best-effort (one bad file does not stop the run), but the
        # failure is reported instead of vanishing, and KeyboardInterrupt /
        # SystemExit are no longer swallowed.
        skipped_files.append((fp, err))
        print('Skipped {}: {!r}'.format(fp, err))

In [5]:
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(write_folder, exist_ok=True)

# Output file name -> table written to it.
tables_to_write = [
    ('monthly_popularity.csv', popularity.df),
    ('battle_counts.csv', usage.df),
    ('nature_counts.csv', usage.nature_df),
    ('ability_counts.csv', usage.ability_df),
    ('teammate_stats.csv', usage.teammate_df),
    ('counter_stats.csv', usage.counter_df),
]

for file_name, table in tables_to_write:
    table.to_csv(os.path.join(write_folder, file_name), header=True, index=False)

In [7]:
def _as_reference_table(name_to_id):
    """Turn a name -> id dict into a two-column DataFrame ordered (id, name)."""
    return pd.DataFrame({'id': list(name_to_id.values()),
                         'name': list(name_to_id.keys())},
                        columns=['id', 'name'])


# Lookup tables that translate the integer ids used in the CSVs back to names.
pokemon_ref = _as_reference_table(usage.ids)
nature_ref = _as_reference_table(usage.natures)
ability_ref = _as_reference_table(usage.abilities)

for ref_table, ref_file in ((pokemon_ref, 'pokemon_reference.csv'),
                            (nature_ref, 'nature_reference.csv'),
                            (ability_ref, 'ability_reference.csv')):
    ref_table.to_csv(write_folder+ref_file, header=True, index=False)