In [111]:
import pandas as pd
import json
import fasttext

# Loading merged data from channels

In [112]:
df_channels = pd.read_csv('/Users/katerynaburovova/PycharmProjects/dehumanization/data/merged_dataset/df_channels.csv', header=None)

In [113]:
df_channels.columns = df_channels.iloc[0]
df_channels = df_channels[1:]

In [114]:
df_channels = (df_channels
               .drop(columns=['Unnamed: 0'], axis=1)
               .reset_index(drop=True))

In [115]:
df_channels

Unnamed: 0,id,date,views,reactions,to_id,fwd_from,message,type,duration,frw_from_title,frw_from_name,channel_name
0,2318,2022-11-21 12:17:27+00:00,2993.0,MessageReactions(results=[ReactionCount(reacti...,PeerChannel(channel_id=1182607945),"MessageFwdHeader(date=datetime.datetime(2022, ...",Поэт Анна Долгарева проведет творческую встреч...,photo,,Мастерская Лундстрема,lundstremart,dolgarevaanna
1,115262,2022-11-21 12:58:51+00:00,,MessageReactions(results=[ReactionCount(reacti...,PeerChannel(channel_id=1591547950),,Ссылку на канал будьте добры,text,,,,dolgarevaanna
2,115260,2022-11-21 12:57:05+00:00,,,PeerChannel(channel_id=1591547950),,От Александра Сергеевича.,text,,,,dolgarevaanna
3,115252,2022-11-21 12:51:14+00:00,,,PeerChannel(channel_id=1591547950),,"Светлана, а Вы от кого пришли, знаток поэзии?",text,,,,dolgarevaanna
4,115251,2022-11-21 12:50:46+00:00,,,PeerChannel(channel_id=1591547950),,Примялось? Пусть. Это все равно ничего не мен...,text,,,,dolgarevaanna
...,...,...,...,...,...,...,...,...,...,...,...,...
186,24120,2022-11-21 09:15:14+00:00,270058.0,,PeerChannel(channel_id=1497011710),,25 ноября в мире будет отмечаться Международны...,video,56.0,,,bloodysx
187,24119,2022-11-21 09:00:50+00:00,286726.0,,PeerChannel(channel_id=1497011710),,Комитет Госдумы по информационной политике счи...,video,46.0,,,bloodysx
188,24118,2022-11-21 07:55:21+00:00,311880.0,,PeerChannel(channel_id=1497011710),,В Госдуме задумались над увеличением зарплат в...,video,35.0,,,bloodysx
189,24117,2022-11-21 07:02:16+00:00,338178.0,,PeerChannel(channel_id=1497011710),,"Строго говоря, это не «набрал» с нуля, это вос...",text,,,,bloodysx


### Some redundant code for id extraction

In [116]:
# def extract_id(text):
#     if text is None:
#         return ""
#     else:
#         pos = text.find('channel_id=')+len('channel_id=')
#         result = text[pos:(pos+25)].split('),')[0]
#         return result

In [117]:
# df_channels['frw_from_id_clean'] = df_channels['fwd_from'].apply(lambda x: extract_id(str(x)))

In [118]:
# indices = (df_channels['frw_from_id_clean']
#               .value_counts()
#               .reset_index(name="count")[1:]['index']
#               .to_list())

### Loading list of unique channel name handles referenced by the group

In [121]:
names = (df_channels['frw_from_name']
              .value_counts()
              .reset_index(name="count")[1:]['index']
              .to_list())
names

['milinfolive',
 'rt_special',
 'tass_agency',
 'lundstremart',
 'dolgareva',
 'kpdlit',
 'smotri_z',
 'Soldieroffortune777',
 'RVvoenkor',
 'Z_memes']

In [122]:
# dict_id = {"ids": indices}
# json_object = json.dumps(dict_id, indent=4)
# with open("/Users/katerynaburovova/PycharmProjects/dehumanization/data/ids_channels_list.json", "w") as outfile:
#     outfile.write(json_object)

In [123]:
dict_names = {"titles": names}
json_object = json.dumps(dict_names, indent=4)
with open("/Users/katerynaburovova/PycharmProjects/dehumanization/data/names_channels_list.json", "w") as outfile:
    outfile.write(json_object)

# Exploring the data

In [124]:
class LanguageIdentification:

    def __init__(self):
        pretrained_lang_model = "/Users/katerynaburovova/PycharmProjects/comp_soc_sci_projects/fasttext/lid.176.bin"
        self.model = fasttext.load_model(pretrained_lang_model)

    def predict_lang(self, text, label_only=True):
        predictions = self.model.predict(text, k=1)
        if label_only:
            return predictions[0][0][-2:]
        return predictions

In [125]:
lang_identifier = LanguageIdentification()
df_channels['message'] = df_channels['message'].apply(lambda x: str(x).replace('\n', ' '))
df_channels['lang'] = df_channels['message'].apply(lambda x: lang_identifier.predict_lang(x))



In [126]:
df_channels['lang'].value_counts()

ru    167
ht     16
bg      2
en      2
ja      2
uk      1
es      1
Name: lang, dtype: int64

In [127]:
df_channels[df_channels['lang']=='uk']

Unnamed: 0,id,date,views,reactions,to_id,fwd_from,message,type,duration,frw_from_title,frw_from_name,channel_name,lang
25,115254,2022-11-21 12:51:37+00:00,,,PeerChannel(channel_id=1591547950),,Ну-ну,text,,,,dolgarevaanna,uk


# Set up for the influence maximization problem

Influence maximization is the problem of finding a small set of most influential nodes in a social network so that their aggregated influence in the network is maximized (maximizing the information diffusion). The most widely used diffusion models include classical epidemic outbreak model, independent cascade model, linear threshold model, Weight Cascade(wc), trigger and rumor-spreading model (with independent cascade model, linear threshold model being more popular approaches).

We could either use the threshold for channel to qualify, or use some kind of connectivity (centrality?) measure to estimate the influence and include top-X channel out of all the channels we've been able to identify.

We collect the info on channels referenced by the ones we parse (id, title, name handle) to use for building the graph. We also use views as a proxy for current audience estimate.

In [128]:
df_channels.head(5)

Unnamed: 0,id,date,views,reactions,to_id,fwd_from,message,type,duration,frw_from_title,frw_from_name,channel_name,lang
0,2318,2022-11-21 12:17:27+00:00,2993.0,MessageReactions(results=[ReactionCount(reacti...,PeerChannel(channel_id=1182607945),"MessageFwdHeader(date=datetime.datetime(2022, ...",Поэт Анна Долгарева проведет творческую встреч...,photo,,Мастерская Лундстрема,lundstremart,dolgarevaanna,ru
1,115262,2022-11-21 12:58:51+00:00,,MessageReactions(results=[ReactionCount(reacti...,PeerChannel(channel_id=1591547950),,Ссылку на канал будьте добры,text,,,,dolgarevaanna,ru
2,115260,2022-11-21 12:57:05+00:00,,,PeerChannel(channel_id=1591547950),,От Александра Сергеевича.,text,,,,dolgarevaanna,ru
3,115252,2022-11-21 12:51:14+00:00,,,PeerChannel(channel_id=1591547950),,"Светлана, а Вы от кого пришли, знаток поэзии?",text,,,,dolgarevaanna,ru
4,115251,2022-11-21 12:50:46+00:00,,,PeerChannel(channel_id=1591547950),,Примялось? Пусть. Это все равно ничего не мен...,text,,,,dolgarevaanna,ru
