In [2]:
import pandas as pd
import numpy as np
import nltk
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.decomposition import NMF

In [3]:
data = pd.read_csv('un-general-debates.csv')

In [8]:
data.head()

Unnamed: 0,session,year,country,text
0,44,1989,MDV,﻿It is indeed a pleasure for me and the member...
1,44,1989,FIN,"﻿\nMay I begin by congratulating you. Sir, on ..."
2,44,1989,NER,"﻿\nMr. President, it is a particular pleasure ..."
3,44,1989,URY,﻿\nDuring the debate at the fortieth session o...
4,44,1989,ZWE,﻿I should like at the outset to express my del...


In [5]:
stops = set(nltk.corpus.stopwords.words('english'))

In [7]:
bow = CountVectorizer(stop_words=stops)
bag = bow.fit_transform(data.text)

In [9]:
X_train, X_test, y_train, y_test = train_test_split(bag, data.country, random_state=3)

In [14]:
X = X_train.toarray()

In [19]:
X.shape

(5630, 54754)

In [17]:
nmf = NMF(n_components=4)

In [20]:
W = nmf.fit_transform(X)

In [21]:
H = nmf.components_

In [22]:
words = bow.get_feature_names()
words = np.array(words)

In [28]:
words[np.argsort(H, axis=1)][:,-20:]

array([['also', 'general', 'african', 'problems', 'assembly',
        'community', 'government', 'us', 'new', 'country', 'people',
        'south', 'must', 'development', 'developing', 'africa',
        'international', 'economic', 'world', 'countries'],
       ['assembly', 'nuclear', 'council', 'one', 'organization', 'also',
        'rights', 'us', 'security', 'global', 'peace', 'general', 'new',
        'development', 'human', 'states', 'must', 'world', 'united',
        'nations'],
       ['policy', 'independence', 'would', 'arab', 'security',
        'relations', 'soviet', 'military', 'war', 'international',
        'nuclear', 'republic', 'countries', 'nations', 'peoples',
        'world', 'peace', 'united', 'people', 'states'],
       ['stability', 'assembly', 'support', 'cooperation', 'human',
        'process', 'region', 'rights', 'political', 'country',
        'economic', 'also', 'council', 'general', 'efforts',
        'development', 'community', 'peace', 'security', 'interna

In [25]:
countries = np.array(data.country)

In [27]:
countries[np.argsort(W, axis=0)[-30:,:]].T

array([['ZMB', 'PRY', 'FIN', 'CAF', 'CPV', 'EGY', 'COM', 'COG', 'CAF',
        'ARE', 'LBN', 'NAM', 'GNQ', 'PAK', 'PAK', 'DZA', 'FRA', 'UKR',
        'ZMB', 'AUT', 'AUS', 'KIR', 'CHE', 'DZA', 'CHE', 'BHS', 'NIC',
        'IND', 'LKA', 'MUS'],
       ['GBR', 'AUS', 'STP', 'SYR', 'TJK', 'GRD', 'DZA', 'GRD', 'STP',
        'MNE', 'UKR', 'BGR', 'SOM', 'NOR', 'TGO', 'FIN', 'CAN', 'MRT',
        'GTM', 'LBR', 'MUS', 'ZMB', 'ETH', 'CUB', 'ZMB', 'USA', 'COD',
        'STP', 'PRT', 'RUS'],
       ['IND', 'ZWE', 'SYR', 'HND', 'POL', 'PRY', 'BDI', 'ECU', 'PRK',
        'ARG', 'POL', 'MEX', 'UKR', 'IND', 'AND', 'DEU', 'GNB', 'DMA',
        'GNQ', 'TUN', 'GIN', 'NAM', 'NIC', 'THA', 'GUY', 'LBY', 'RUS',
        'HND', 'BGR', 'ARG'],
       ['SLV', 'IDN', 'SWZ', 'AGO', 'MNE', 'MNG', 'USA', 'ARG', 'ISR',
        'NPL', 'LUX', 'LCA', 'BTN', 'GAB', 'HTI', 'MYS', 'IDN', 'LBY',
        'SUR', 'RWA', 'VCT', 'LIE', 'EGY', 'GRD', 'GNQ', 'GNQ', 'MWI',
        'MDG', 'CHE', 'LCA']], dtype=object)