In [84]:
from pymongo import MongoClient
import pandas as pd
from bson.objectid import ObjectId
import numpy as np
from collections import defaultdict
from tqdm import tnrange, tqdm_notebook
# from topics_by_id import get_topic_by_id

In [75]:
# %load topics_by_id.py
# Human-readable labels for the 50 topics, keyed by integer topic number.
# Built once here rather than on every lookup, since the lookup is called
# once per topic per document in the loops further down.
# NOTE(review): ids 1 and 36 carry the same label, so any structure keyed by
# label (rather than by id) merges those two topics — confirm this is intended.
TOPICS_NUMBERING = {
    0: "Accidents with children",
    1: "Local authorities: appointments, resignations & statements",
    2: "*Urban development (misc)",
    3: "IT & military high tech",
    4: "Fires",
    5: "Urban demography & housing payments",
    6: "Police actions drug, alcohol & counterfeit money crimes",
    7: "Omsk region industrial developemnt",
    8: "Ads of banking services",
    9: "Holidays and VIP-weddingss",
    10: "Trials on economic crime",
    11: "Public transport and traffic",
    12: "Weather",
    13: "Yury Gamburg resignation",
    14: "*NATO warships in Black sea, Russian rocket launch & contests",
    15: "Regional parliament activities",
    16: "Urban landscaping & greening",
    17: "Schools, orphanages & child charity",
    18: "Car sales",
    19: "Car accidents",
    20: "Education",
    21: "Real estate: contruction",
    22: "Art and literature",
    23: "Street & bridge reconstruction & maintenance",
    24: "Housing & the case of disabled Akhmetov",
    25: "Stray dogs & doghunters",
    26: "Urban events & openings",
    27: "Crimea accession",
    28: "Concerts",
    29: "Sport, hockey",
    30: "Hockey",
    31: "Control & regulation of enterprises",
    32: "Regional taxes & fuel prices",
    33: "Missing persons announcements",
    34: "Theaters & festivals",
    35: "*Omsk media, plants & animals",
    36: "Local authorities: appointments, resignations & statements",
    37: "Macroeconomic events: currency rates & oil prices",
    38: "Housing: heating",
    39: "Russia, Ukraine & US international relations",
    40: "Beauty contests & their winners",
    41: "Arbitration court and the Mostovik case",
    42: "*Military holidays",
    43: "Criminal news",
    44: "Movies and Movie stars",
    45: "Olympic Games 2014 & Omsk athletes",
    46: "Uninterpretable",
    47: "Ivan Klimov’s murder",
    48: "Abridgments of traffic law",
    49: "Regional elections and misc."
}


def get_topic_by_id(id):
    """Return the human-readable label for a topic number.

    Parameters
    ----------
    id : int
        Topic number, 0 through 49. The parameter name shadows the builtin
        ``id`` but is kept so keyword callers keep working.

    Returns
    -------
    str
        The label stored for that topic number.

    Raises
    ------
    KeyError
        If ``id`` is not one of the known topic numbers.
    """
    return TOPICS_NUMBERING[id]

In [4]:
# Handle to the "thesis" database. MongoClient() is called with no arguments,
# so the connection target is whatever pymongo uses by default.
db = MongoClient()["thesis"]

In [23]:
# Peek at a single document from the final_db collection to see which fields
# it carries (the output shows _id, title, url and a topics mapping).
db.final_db.find_one()

{'_id': ObjectId('5489d17037a808115d1e6c24'),
 'title': 'В Омске пассажир автобуса украл у кондуктора золото',
 'url': 'http://www.bk55.ru/news/article/37963/',
 'topics': {'0': 0.0002985074626865674,
  '1': 0.0002985074626865674,
  '2': 0.0002985074626865674,
  '3': 0.0002985074626865674,
  '4': 0.0002985074626865674,
  '5': 0.0002985074626865674,
  '6': 0.0002985074626865674,
  '7': 0.0002985074626865674,
  '8': 0.0002985074626865674,
  '9': 0.0002985074626865674,
  '10': 0.0002985074626865674,
  '11': 0.2120111568533553,
  '12': 0.0002985074626865674,
  '13': 0.0002985074626865674,
  '14': 0.0002985074626865674,
  '15': 0.0002985074626865674,
  '16': 0.0002985074626865674,
  '17': 0.0002985074626865674,
  '18': 0.0002985074626865674,
  '19': 0.09570981130061927,
  '20': 0.0002985074626865674,
  '21': 0.0002985074626865674,
  '22': 0.0002985074626865674,
  '23': 0.0002985074626865674,
  '24': 0.0002985074626865674,
  '25': 0.0002985074626865674,
  '26': 0.0002985074626865674,
  '27':

In [28]:
# Collect every article whose title or raw content contains the substring
# "крым" (case-insensitive), keyed by URL. For each hit we remember where the
# match occurred and the document's _id so it can be re-fetched later.
crimea_dict = {}

# Only these fields are read in the loop; the projection keeps the rest of
# each document from being transferred during the full-collection scan.
# "_id" is included by MongoDB by default.
needed_fields = {"title": 1, "content_raw": 1, "url": 1}

# tqdm_notebook gives a live progress counter, matching the later cells,
# instead of printing the loop index with a carriage return.
for doc in tqdm_notebook(db.final_db.find({}, needed_fields)):
    in_title = "крым" in doc["title"].lower()
    in_content = "крым" in doc["content_raw"].lower()
    if in_title or in_content:
        # A later document with the same URL overwrites the earlier entry.
        crimea_dict[doc["url"]] = {
            "in_title": in_title,
            "in_content": in_content,
            "_id": doc["_id"]
        }

33874

In [34]:
# One row per matching URL (the dict keys become the index); the columns are
# the inner-dict keys: in_title, in_content and _id.
crimea = pd.DataFrame.from_dict(crimea_dict, orient="index")

In [35]:
# Preview the first rows of the matches table.
crimea.head()

Unnamed: 0,in_title,in_content,_id
http://gorod55.ru/news/article/014ef3ad-ca08-4abb-a7b1-4e136b12d7da,False,True,5489d9e337a808115d1eb2a5
http://gorod55.ru/news/article/028583fa-5cc1-43c5-f536-9e773e0bbf43,True,True,5489d8bb37a808115d1ea934
http://gorod55.ru/news/article/02c1e5af-442d-4441-ee9f-ea702fd9e0d2,True,True,5489d9bf37a808115d1eb16e
http://gorod55.ru/news/article/0507cbb6-6fd7-4051-e01d-430d3b458c9e,False,True,5489d97637a808115d1eaf41
http://gorod55.ru/news/article/07a7733f-3d5e-4bb2-a462-077fb3545c9c,False,True,5489d9c137a808115d1eb185


In [81]:
# For every matched article that has comments, spread its comment statistics
# over the topics in proportion to the article's topic probabilities, and
# accumulate the weighted sums per topic label.
topic_results = {}

for _id in crimea["_id"].dropna().tolist():
    # Only "comments" and "topics" are read below, so fetch just those fields
    # instead of the whole document.
    doc = db.final_db.find_one({"_id": ObjectId(_id)}, {"comments": 1, "topics": 1})

    comments_num = len(doc["comments"])
    if not comments_num:
        # No comments means no sentiment to average; the article contributes
        # nothing to any of the three sums.
        continue

    # The last element of each comment record is taken as its score.
    # NOTE(review): assumes comment[-1] is a numeric sentiment value — confirm
    # against the comment schema.
    sent_scores = [comment[-1] for comment in doc["comments"]]
    avg_comment_sent = sum(sent_scores) / comments_num
    # Despite the "avg_" prefix this is the standard deviation of the scores
    # within this one article.
    avg_comment_std = np.std(sent_scores)

    for topic, prob in doc["topics"].items():
        # Topic keys are stored as strings ("0", "1", ...); map to the label.
        topic = get_topic_by_id(int(topic))
        # Create the per-topic accumulator only on first sight, rather than
        # building a throwaway defaultdict on every iteration.
        if topic not in topic_results:
            topic_results[topic] = defaultdict(int)
        totals = topic_results[topic]
        totals["comments_num"] += prob * comments_num
        totals["avg_comment_sent"] += prob * avg_comment_sent
        totals["avg_comment_std"] += prob * avg_comment_std

In [90]:
# Total topic probability per topic label, summed over every matched article.
# NOTE(review): this sum includes articles that have no comments, whereas the
# topic_results accumulation skips them — confirm that is the intended
# denominator for the sentiment statistics.
topics_sum = {}

for _id in tqdm_notebook(crimea["_id"].dropna().tolist()):
    # Only the "topics" field is needed here; skip the rest of the document.
    doc = db.final_db.find_one({"_id": ObjectId(_id)}, {"topics": 1})

    for topic, prob in doc["topics"].items():
        # Topic keys are stored as strings; convert before the label lookup.
        topic = get_topic_by_id(int(topic))
        topics_sum[topic] = topics_sum.get(topic, 0) + prob

HBox(children=(IntProgress(value=0, max=825), HTML(value='')))




In [97]:
# Outer keys (topic labels) become columns; the three accumulated statistics
# become the rows.
topic_results_df = pd.DataFrame(topic_results)
topic_results_df

Unnamed: 0,*Military holidays,"*NATO warships in Black sea, Russian rocket launch & contests","*Omsk media, plants & animals",*Urban development (misc),Abridgments of traffic law,Accidents with children,Ads of banking services,Arbitration court and the Mostovik case,Art and literature,Beauty contests & their winners,...,Stray dogs & doghunters,Street & bridge reconstruction & maintenance,Theaters & festivals,Trials on economic crime,Uninterpretable,Urban demography & housing payments,Urban events & openings,Urban landscaping & greening,Weather,Yury Gamburg resignation
avg_comment_sent,2.863734,1.183613,1.00897,7.034216,0.64874,0.422359,0.755008,1.880956,1.326721,4.37183,...,4.143805,0.654998,1.245163,0.801112,2.956542,2.500416,1.714384,1.656302,0.416344,2.612819
avg_comment_std,5.478044,3.196276,1.961055,16.015276,1.147078,0.728695,1.77034,2.778885,3.125302,5.825103,...,12.246031,1.517704,2.649789,2.100651,6.353951,3.644941,2.988933,2.291127,1.603462,7.102884
comments_num,131.501857,158.845998,37.652366,459.607222,26.124015,20.846283,45.673728,87.740923,77.214581,155.734849,...,412.669912,41.167687,72.938583,66.191943,183.195142,117.765113,82.841915,46.875472,29.239336,317.279639


In [96]:
# Despite the "_df" suffix this is a pandas Series, indexed by topic label.
topics_sum_df = pd.Series(topics_sum)
topics_sum_df

*Military holidays                                                12.218537
*NATO warships in Black sea, Russian rocket launch & contests      7.689874
*Omsk media, plants & animals                                      4.566573
*Urban development (misc)                                         40.813866
Abridgments of traffic law                                         2.171464
Accidents with children                                            2.188863
Ads of banking services                                            5.440356
Arbitration court and the Mostovik case                            8.057398
Art and literature                                                 7.863337
Beauty contests & their winners                                   16.135863
Car accidents                                                      1.441840
Car sales                                                          1.917343
Concerts                                                           6.824793
Control & re

In [98]:
# Normalise each topic column by that topic's total probability mass. The
# Series index (topic labels) is aligned against the DataFrame's columns.
# NOTE(review): topic_results only accumulates articles that have comments,
# while topics_sum covers every matched article, so the sentiment rows are
# divided by a larger weight than they were summed over — confirm intended.
topic_results_df.div(topics_sum_df, axis="columns")

Unnamed: 0,*Military holidays,"*NATO warships in Black sea, Russian rocket launch & contests","*Omsk media, plants & animals",*Urban development (misc),Abridgments of traffic law,Accidents with children,Ads of banking services,Arbitration court and the Mostovik case,Art and literature,Beauty contests & their winners,...,Stray dogs & doghunters,Street & bridge reconstruction & maintenance,Theaters & festivals,Trials on economic crime,Uninterpretable,Urban demography & housing payments,Urban events & openings,Urban landscaping & greening,Weather,Yury Gamburg resignation
avg_comment_sent,0.234376,0.153918,0.220947,0.172349,0.298757,0.192958,0.138779,0.233445,0.168722,0.270939,...,0.143543,0.182004,0.147073,0.127889,0.183784,0.211192,0.228491,0.275715,0.059734,0.172532
avg_comment_std,0.448339,0.415647,0.429437,0.392398,0.528251,0.33291,0.325409,0.344886,0.397452,0.361003,...,0.424209,0.421724,0.312981,0.335346,0.394972,0.307862,0.398361,0.38139,0.230053,0.469023
comments_num,10.762488,20.656516,8.245213,11.261056,12.030598,9.523795,8.395356,10.889486,9.819569,9.651473,...,14.295091,11.439248,8.615186,10.566817,11.387713,9.946778,11.041059,7.80308,4.195039,20.95085
