In [108]:
import pickle 
import numpy as np 
import pandas as pd 
import datapane as dp 
import base64

!unzip test_df.zip



Archive:  test_df.zip
  inflating: test_df.csv             


In [110]:
# Load the profile table from the CSV that was extracted from test_df.zip.
new_profile = pd.read_csv('test_df.csv')

In [None]:
# Fraction (0-1) of missing values in each column.
missing_per_column = new_profile.isnull().sum()
missing_per_column / len(new_profile)

In [None]:
# Preview the first 10 rows of the loaded table.
new_profile.head(10)

# Visualization

## Bar graphs

In [None]:
import altair as alt 
import matplotlib.pyplot as plt
import numpy as np

In [None]:
import plotly.express as px


In [None]:
# `top_followers` was previously only available as leftover kernel state.
# Build it here so the notebook survives Restart & Run All: users ranked by
# follower count, highest first (the same ordering the per-feature loop uses,
# and the one the top-1% share cells rely on when slicing the first rows).
top_followers = new_profile.sort_values(by='followers', ascending=False)

fig = px.bar(top_followers,
             x='user_name',
             y='followers',
             hover_data=['followers'],
            )

# Change background color
fig.update_layout({'plot_bgcolor': 'rgba(36, 83, 97, 0.06)'})

fig.show()

In [None]:
# Number of rows that make up the top 1% of `top_followers` (truncated).
n_rows = len(top_followers)
top_n = int(n_rows * 0.01)
top_n

In [None]:
# Share of all followers held by the first `top_n` rows of `top_followers`.
follower_counts = top_followers['followers']
sum(follower_counts.iloc[:top_n]) / sum(follower_counts)

In [None]:
# Columns that each get their own bar chart.
features = [
    'followers',
    'following',
    'total_stars',
    'max_star',
    'forks',
    'contribution',
]

# Collects every Datapane block that goes into the final report.
figs = []

for col in features:
    # Rank users by the current feature, largest value first.
    top_col = new_profile.sort_values(by=col, axis=0, ascending=False)

    # Every feature except contribution is drawn on a log-scaled y-axis.
    log_y = col != 'contribution'

    bar_fig = px.bar(top_col, x='user_name', y=col, hover_data=[col], log_y=log_y)
    bar_fig.update_layout(plot_bgcolor='rgba(36, 83, 97, 0.06)')
    bar_fig.show()

    # Wrap the plotly figure so it can be added to the report.
    fig = dp.Plot(bar_fig)
    figs.append(fig)



## Correlation

In [None]:
# Pairwise scatter plots of the numeric profile features.
scatter_dimensions = [
    'forks',
    'total_stars',
    'followers',
    'following',
    'max_star',
    'contribution',
]

correlation = px.scatter_matrix(
    new_profile,
    dimensions=scatter_dimensions,
    title='Correlation between datapoints',
    width=800,
    height=800,
)
correlation.show()

# Add the scatter matrix to the report blocks.
figs.append(dp.Plot(correlation))

In [None]:
# Correlate only numeric/boolean columns. Since pandas 2.0 `DataFrame.corr()`
# no longer silently drops text columns and raises on them instead, so the
# selection is made explicit here (works on older pandas as well).
numeric_profile = new_profile.select_dtypes(include=['number', 'bool'])
corr = numeric_profile.corr()

# Add the correlation matrix to the report as a table.
figs.append(dp.Table(corr))

## Languages

In [None]:
import ast

# Flatten every user's language list into one list of language names.
# (The previous loop extended `languages` with itself, so it always stayed empty.)
languages = []
for language in new_profile['languages']:
    if isinstance(language, str):
        # NOTE(review): the column comes from a CSV, where a list is presumably
        # stored as its string repr (e.g. "['Python', 'Go']") - confirm against
        # test_df.csv. Parse it back; unparsable text counts as one language name.
        try:
            parsed = ast.literal_eval(language)
        except (ValueError, SyntaxError):
            parsed = None
        language = list(parsed) if isinstance(parsed, (list, tuple)) else [language]

    if isinstance(language, (list, tuple)):
        languages.extend(language)
    else:
        # Missing value (e.g. NaN): record the user under the 'None' bucket.
        languages += ['None']

In [None]:
from collections import Counter 

# Map each language name to the number of times it appears in `languages`.
language_counter = Counter(languages)
occ = dict(language_counter)
occ

In [None]:
# Keep only languages that occur more than 10 times, as (language, frequency) pairs.
top_languages = [(language, frequency) for language, frequency in occ.items() if frequency > 10]

# Building the frame from records (instead of transposing with zip and indexing
# [0]/[1]) also works when no language passes the threshold: the result is then
# an empty frame with the expected columns rather than an IndexError.
language_df = pd.DataFrame(top_languages, columns=['languages', 'frequency'])

# Most frequent language first; returns a new frame instead of sorting in place.
language_df = language_df.sort_values(by='frequency', ascending=False)

language = px.bar(language_df, y='frequency', x='languages',
      title='Frequency of languages')

# Add the language chart to the report blocks.
figs.append(dp.Plot(language))

In [None]:
# Render the 'Frequency of languages' bar chart inline.
language.show()

## Hireable

In [None]:
import altair as alt

# Count the rows for each value of the `hireable` column.
hireable_counts = alt.Chart(new_profile).transform_aggregate(
    count='count()',
    groupby=['hireable'],
)

# One bar per hireable value (ordinal x-axis), bar height = row count.
hireable = hireable_counts.mark_bar().encode(x='hireable:O', y='count:Q')

# Add the chart to the report blocks.
figs.append(dp.Plot(hireable))

In [None]:
# Display the hireable bar chart as this cell's output.
hireable

## Locations

In [None]:
from geopy.geocoders import Nominatim
import folium

# The geocoder was never instantiated before, so every lookup raised NameError
# and was swallowed by the bare except. Nominatim needs an identifying user agent.
geolocator = Nominatim(user_agent='github_profile_analysis')

locations = list(new_profile['location'])

lats = []          # latitude of every row that could be geocoded, in row order
lons = []          # longitude of every row that could be geocoded, in row order
exceptions = []    # location value of every row that could NOT be geocoded

# Each distinct location is looked up once. Besides saving requests, this makes
# all rows sharing a location succeed or fail together, which keeps `lats`/`lons`
# the same length as the rows that survive a later `isin(exceptions)` filter.
coordinates = {}   # location string -> (lat, lon), or None when the lookup failed

for loc in locations:
    # Missing or blank locations (NaN, '') cannot be geocoded.
    if not isinstance(loc, str) or not loc.strip():
        exceptions.append(loc)
        continue

    if loc not in coordinates:
        try:
            match = geolocator.geocode(loc)
        except Exception:
            # Best effort: a network/service error for one location must not
            # abort the whole run; the row is reported through `exceptions`.
            match = None
        # geocode() returns None when the place is not found.
        coordinates[loc] = None if match is None else (match.latitude, match.longitude)

    if coordinates[loc] is None:
        exceptions.append(loc)
    else:
        lat, lon = coordinates[loc]
        lats.append(lat)
        lons.append(lon)

In [None]:
# Keep only the rows whose location is not listed in `exceptions`.
is_unresolved = new_profile['location'].isin(exceptions)
location_df = new_profile[~is_unresolved]

In [None]:
# Attach the coordinates with `assign`, which returns a new frame. Writing the
# columns directly into `location_df` (a filtered slice of `new_profile`)
# triggers pandas' SettingWithCopyWarning.
location_df = location_df.assign(latitude=lats, longitude=lons)

In [None]:
# Discard rows that have no longitude value.
has_no_longitude = ['longitude']
location_df = location_df.dropna(subset=has_no_longitude, axis=0)

In [None]:
# Plot user locations on a world map with Plotly's scatter_geo:
# marker color follows total_stars and marker size follows forks.
# Remaining NaNs are replaced with 0 before plotting.
map_source = location_df.fillna(0)

m = px.scatter_geo(
    map_source,
    lat='latitude',
    lon='longitude',
    color='total_stars',
    size='forks',
    hover_data=['user_name', 'followers'],
    title='Locations of Top Users',
)
m.show()

# Add the map to the report blocks.
figs.append(dp.Plot(m))

In [None]:
# Intentionally empty: the map `m` is already appended to `figs` right after it
# is created in the scatter_geo cell. Appending it a second time here put the
# same map into the published report twice.

## Word Clouds of Descriptions and Bios


In [None]:
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from wordcloud import WordCloud, STOPWORDS 
import matplotlib.pyplot as plt

# Fetch the NLTK resources in the same order as before:
# the stopword list, the punkt tokenizer data and wordnet.
for nltk_resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(nltk_resource)

      
def process_text(features):
    '''Merge text snippets into one cleaned, space-separated string.

    Parameters
    ----------
    features : iterable
        Text snippets. Entries that are not strings (None, or the float NaN
        that pandas uses for missing values) are ignored.

    Returns
    -------
    str
        The snippets joined by spaces, lower-cased, stripped of ASCII
        punctuation, tokenized, and with English stop words removed.
    '''
    # Keep real strings only: the old `!= None` test let NaN floats through,
    # which made ' '.join raise TypeError.
    snippets = [row for row in features if isinstance(row, str)]

    # lowercase
    text = ' '.join(snippets).lower()

    # remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # a set gives constant-time membership checks while filtering tokens
    stop_words = set(stopwords.words('english'))

    # tokenize, then drop stop words
    tokens = word_tokenize(text)
    return ' '.join(token for token in tokens if token not in stop_words)

def make_wordcloud(new_text):
    '''Draw a word cloud for `new_text`, show it, and return the figure.

    Parameters
    ----------
    new_text : str
        Text from which the word cloud is generated.

    Returns
    -------
    The matplotlib figure the word cloud was drawn on.
    '''
    cloud_image = WordCloud(
        width=800,
        height=800,
        background_color='white',
        min_font_size=10,
    ).generate(new_text)

    # The figure is created through pyplot, so the plt.* calls below draw on it.
    figure = plt.figure(figsize=(8, 8), facecolor=None)
    plt.imshow(cloud_image)
    plt.axis("off")           # hide the axes around the image
    plt.tight_layout(pad=0)   # no padding around the image
    plt.show()

    return figure
    
import ast

# Collect every description of every user into one flat list.
descriptions = []
for desc in new_profile['descriptions']:
    if isinstance(desc, str):
        # NOTE(review): the column comes from a CSV, where a list is presumably
        # stored as its string repr - confirm against test_df.csv. Parse it
        # back; unparsable text is kept as a single description. (Using `+=`
        # with a plain string would add it one character at a time.)
        try:
            parsed = ast.literal_eval(desc)
        except (ValueError, SyntaxError):
            parsed = None
        desc = list(parsed) if isinstance(parsed, (list, tuple)) else [desc]

    # Anything that is not a list at this point (e.g. NaN) carries no text.
    if isinstance(desc, (list, tuple)):
        descriptions.extend(desc)

# From here on `descriptions` is the cleaned text (a single string).
descriptions = process_text(descriptions)

cloud = make_wordcloud(descriptions)

# Add the word cloud to the report blocks.
figs.append(dp.Plot(cloud))

In [None]:
# Collect all bios that are present. Missing bios are NaN floats, which cannot
# be joined into text, so they are dropped up front. (The previous try/except
# around list.append could never trigger and filtered nothing.)
bios = new_profile['bio'].dropna().astype(str).tolist()

text = process_text(bios)

cloud = make_wordcloud(text)

# Add the word cloud to the report blocks.
figs.append(dp.Plot(cloud))

In [None]:
# Bundle every block collected in `figs` into one Datapane report and publish
# it under the name 'finding'.
# NOTE(review): publishing presumably needs an authenticated Datapane session - confirm.
dp.Report(*figs).publish(name='finding')