In [1]:
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import bs4
from xml.etree import ElementTree as ET
from tqdm import tqdm
import nltk
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from collections import defaultdict
pd.set_option('mode.chained_assignment', None)
%matplotlib inline

In [218]:
# Load SVD topics.
# Read the spreadsheet, keep only the columns whose label does not contain
# 'Unnamed', and discard the rows that are empty in every remaining column.
df = (
    pd.read_excel('svd_topics.xlsx')
    .pipe(lambda sheet: sheet[[name for name in sheet.columns
                               if 'Unnamed' not in name]])
    .dropna(how='all')
)
df

Unnamed: 0,Generic Python,Django Framework,Web App,Python Data Types,Web Scraping,GUI Tools,Python Version 2.7,Computations Tools,Python Version 3,Data Transformation,Deep Learning,Machine Learning
0,"('python', 0.9943218682525379)","('django', 0.9880502723563694)","('google-app-engine', 0.9983521744661842)","('list', 0.9404907529312443)","('regex', 0.8422051965282726)","('windows', 0.9468411014784877)","('python-2.7', 0.7769092657424509)","('numpy', 0.879251178916456)","('python-3.x', 0.9374201668067704)","('pandas', 0.8268187426571936)","('tensorflow', 0.8531239775608215)","('numpy', 0.9105799158252526)"
1,"('django', 0.09502222831042192)","('django-models', 0.09197634801382917)","('java', 0.016267823489153576)","('string', 0.2367570808609068)","('string', 0.47361778566018253)","('linux', 0.25072191543097255)","('python-3.x', 0.3130541743979987)","('scipy', 0.20412407327137788)","('list', 0.24451087544188038)","('dataframe', 0.27107233849187656)","('keras', 0.3829851406612613)","('tensorflow', 0.23383672826400823)"
2,"('google-app-engine', 0.02812175927549381)","('google-app-engine', 0.03767880399739542)","('javascript', 0.014800116339071318)","('dictionary', 0.19079408926791833)","('parsing', 0.05954310325495384)","('tkinter', 0.1370226656972121)","('regex', 0.19393612047325756)","('matplotlib', 0.1890413242369029)","('dictionary', 0.10853479824120318)","('python-3.x', 0.1636473685165283)","('machine-learning', 0.21301822576905416)","('arrays', 0.16907906792251923)"
3,"('list', 0.011799436896301897)","('django-templates', 0.03135524552257426)","('html', 0.008156884218153734)","('regex', 0.13005489881680654)","('html', 0.05151756918950376)","('macos', 0.06539262338682207)","('tkinter', 0.04106495522121007)","('arrays', 0.12430074120619991)","('string', 0.0705282740423107)","('numpy', 0.1589292431475108)","('deep-learning', 0.15093084767307016)","('keras', 0.15399737416248854)"
4,"('regex', 0.010085786898777175)","('django-forms', 0.03135025797705695)","('json', 0.00797471848220162)","('performance', 0.04302840382490761)","('unicode', 0.04931691668393038)","('c++', 0.04918504736690188)","('windows', 0.023379789050699564)","('performance', 0.025561204029968407)","('tkinter', 0.0629190319545653)","('python-2.7', 0.13618675206703384)","('neural-network', 0.10466030385016119)","('matplotlib', 0.11851924930003034)"
...,...,...,...,...,...,...,...,...,...,...,...,...
104,"('numpy', 0.07211774915423473)","('django-views', 0.0489867075471738)",,,,,,,,,,
105,"('matplotlib', 0.04293898596017816)","('django-forms', 0.04464831693019766)",,,,,,,,,,
106,"('tensorflow', 0.04023276333727933)","('dataframe', 0.03703042026961838)",,,,,,,,,,
107,"('list', 0.033542524307498565)","('django-templates', 0.028733410244083588)",,,,,,,,,,


In [219]:
# Generate a data dictionary containing the average feature importance of each
# component for each derived topic:
#     {topic: {tag: mean of that tag's retained scores within the topic}}

# Tags scoring below this threshold are the low importance tags that were only
# included because they made a component's top 10; they are dropped before
# averaging.
MIN_SCORE = 0.025

col_dct = {}
for topic in df:
    score_dct = defaultdict(list)
    for cell in df[topic]:
        # Blank cells are not strings; skip them.
        if not isinstance(cell, str):
            continue
        # Each cell holds the text of a (tag, score) pair, e.g.
        # "('python', 0.99)". Split on the LAST comma so the score is parsed
        # correctly even if the tag text itself contains a comma.
        raw_word, raw_score = cell.rsplit(',', 1)
        # Strip the opening parenthesis and the quotes around the tag.
        word = raw_word[1:].strip().replace("'", '')
        # Strip the closing parenthesis before converting the score.
        score = float(raw_score[:-1].strip().replace("'", ''))
        if score >= MIN_SCORE:
            score_dct[word].append(score)
    # A tag can appear in several rows of the same topic; average them.
    col_dct[topic] = {w: np.mean(s) for w, s in score_dct.items()}
col_dct

{'Generic Python': {'python': 0.9597792916546926,
  'django': 0.10973823105053163,
  'google-app-engine': 0.02812175927549381,
  'python-2.7': 0.08126727123022512,
  'python-3.x': 0.14853639645215783,
  'numpy': 0.06197311527429927,
  'list': 0.03623584986248669,
  'pandas': 0.15154188859170675,
  'matplotlib': 0.038621103457072196,
  'regex': 0.027043236283018895,
  'dictionary': 0.027832956779949736,
  'dataframe': 0.07491938291398792,
  'tensorflow': 0.03964324785668549},
 'Django Framework': {'django': 0.965587068095199,
  'django-models': 0.0899321779633778,
  'google-app-engine': 0.03369820839170991,
  'django-templates': 0.03198191976600272,
  'django-forms': 0.035784297867397105,
  'django-admin': 0.028667783130784923,
  'django-views': 0.03599437237386713,
  'django-rest-framework': 0.07403743074685624,
  'mysql': 0.02699128104465149,
  'pandas': 0.16167427209683524,
  'dataframe': 0.06785181835675612,
  'python-3.x': 0.07167030680101887,
  'html': 0.03906596971658305},
 'Web 

In [220]:
# Get all unique tags
# Pool the tag names of every topic's {tag: score} mapping into a set to
# de-duplicate them; the resulting list has no meaningful order.
indxs = list({tag for tag_scores in col_dct.values() for tag in tag_scores})

In [221]:
# Generate a wide dataframe of the svd_topics with values equal to average
# feature importance: one row per tag (sorted), one column per topic.
tag_series = pd.Series(sorted(indxs), name='words')
df_topic_scores = (
    tag_series.to_frame()
    # Look each tag up in the topic's {tag: average score} mapping; tags the
    # topic does not contain come back as NaN ...
    .assign(**{topic: tag_series.map(col_dct[topic]) for topic in df.columns})
    # ... and are recorded as a score of 0.
    .fillna(0)
)
df_topic_scores.head()

Unnamed: 0,words,Generic Python,Django Framework,Web App,Python Data Types,Web Scraping,GUI Tools,Python Version 2.7,Computations Tools,Python Version 3,Data Transformation,Deep Learning,Machine Learning
0,algorithm,0.0,0.0,0.0,0.025714,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,arrays,0.0,0.0,0.0,0.052479,0.0,0.0,0.125974,0.235119,0.0,0.089471,0.0,0.169079
2,c++,0.0,0.0,0.0,0.0,0.0,0.049185,0.0,0.0,0.0,0.0,0.0,0.0
3,conv-neural-network,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.052507,0.0
4,csv,0.0,0.0,0.0,0.0,0.0,0.0,0.037469,0.0,0.0,0.042849,0.0,0.0


In [222]:
# Convert wide dataframe to long dataframe: one (words, topic, score) row per
# tag/topic pair. Topics follow the column order of the wide frame and, within
# each topic, tags are in descending sort order.
#
# This is done with a single melt instead of appending one frame per topic:
# DataFrame.append no longer exists in pandas >= 2.0, and growing a frame
# inside a loop re-copies all accumulated rows on every iteration.
df_topic_values = (
    df_topic_scores
    .sort_values('words', ascending=False)
    .melt(id_vars='words', var_name='topic', value_name='score')
)
df_topic_values.to_csv('word_topic_svd_values.csv', index = False)

In [223]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots

df_topic_values = pd.read_csv('word_topic_svd_values.csv')

# Topic whose trace is highlighted when the figure is first shown.
INITIAL_TOPIC = 'Data Transformation'

# Set up interactive highlighting
fig = go.FigureWidget()
fig.layout.hovermode = 'closest'
fig.layout.hoverdistance = -1 #ensures no "gaps" for selecting sparse data

# Plot Scatter Plots per Topic: one trace per topic, drawn as a single column
# of bubbles (x = topic, y = tag) whose area encodes the score.
for t in df_topic_values.topic.unique():
    subdf_topic_values = df_topic_values[df_topic_values.topic == t]
    fig.add_trace(
            go.Scatter(x = subdf_topic_values.topic,
                       y = subdf_topic_values.words,
                       mode = 'markers',
                       name = t,
                       marker = dict(
                           size = subdf_topic_values.score,
                           sizemode = 'area',
                           # Scaled against this topic's own largest score.
                           sizeref = 2*max(subdf_topic_values.score)/(40**2),
                           sizemin = 1,
                           opacity = 0.6,
                           color='#FFD43B',
                           line=dict(width=1,
                                     color='#FFC622')
                            ),
                       hovertemplate = "Topic : %{x}<br>" +
                                       "Tag : %{y}<br>" +
                                       "Value : %{marker.size}"
                      )
    )

# Stylize Figure
fig.update_layout(
    xaxis={'side': 'top',
           'tickangle' : -30},
    width=1000,
    height=1500,
    template='none',
    yaxis_title="<b>Tags</b>",
    legend_title="<b>Derived Topics</b>",
    title={
    # The bold tag must be closed with </b>; the previous "<b>...<b>" left it
    # open.
    'text': "<b>Contribution of Derived SVD Topics</b>",
    'y':1,
    'x':0.5,
    'xanchor': 'center',
    'yanchor': 'top'},
    # Pad both category axes by one slot on each side.
    yaxis_range=[-1,len(df_topic_values.words.unique())],
    xaxis_range=[-1,len(df_topic_values.topic.unique())],
)

# Initially Highlight Data Transformation
# The trace is located by name rather than by a fixed position, so the right
# topic is highlighted even if topics are added, removed or reordered in the
# CSV (and nothing is highlighted, instead of failing, if it is absent).
for initial_trace in fig.data:
    if initial_trace.name == INITIAL_TOPIC:
        initial_trace.marker['color'] = '#306998'
        initial_trace.marker['line']['color'] = '#0A6F99'
        initial_trace.marker['line']['width'] = 3

def update_trace(trace, points, selector):
    """Restyle one trace in response to a click event.

    A trace that received the click (``points.point_inds`` is non-empty) gets
    the highlight fill/outline; any other trace is put back to the base
    fill/outline, which clears a highlight left by an earlier click.

    Parameters
    ----------
    trace : object exposing ``marker['color']`` and ``marker['line'][...]``
        The trace to restyle; modified in place.
    points : object exposing ``point_inds``
        Indices of the clicked points that belong to ``trace``. These are
        empty for every trace except the one that was clicked.
    selector : any
        Not used; accepted because the callback is invoked with it.

    Returns
    -------
    None
    """
    was_clicked = len(points.point_inds) != 0
    if was_clicked:
        fill, outline, outline_width = '#306998', '#0A6F99', 3
    else:
        fill, outline, outline_width = '#FFD43B', '#FFC622', 1

    trace.marker['color'] = fill
    trace.marker['line']['color'] = outline
    trace.marker['line']['width'] = outline_width
    

# Enable On Click Events for each trace
# Every trace gets the same callback; on a click each trace is called back
# with its own (possibly empty) set of clicked points.
for clickable_trace in fig.data:
    clickable_trace.on_click(update_trace)

fig

FigureWidget({
    'data': [{'marker': {'color': '#FFD43B',
                         'line': {'color': '#FFC62…