### Baking pie

*A side project in which I clean data and bake a pie (chart). And a bar chart.*

#### Data cleaning

In [None]:
import pandas
import numpy

# Load the raw software-inventory export; the file is decoded as latin1.
# NOTE(review): this is an absolute path on one specific machine - point it at your
# own copy of softwareInventory.csv before running.
data = pandas.read_csv('/home/lindsay/Downloads/softwareInventory.csv', encoding='latin1')

# Assign readable column names positionally. This is a length-checked assignment:
# pandas raises if the file does not have exactly seven columns.
data.columns = ['ID', 'Device', 'Matched', 'Installation', 'Pathway', 'Application', 'Version']

# We don't need the 'Unnamed: 1' column (renamed 'Matched') that contains "matched"
# in every row - drop it.
data = data.drop(columns=['Matched'])

# To inspect the frame, put `data.head(10)` or `data.columns` on the LAST line of its
# own cell: Jupyter only renders a cell's final expression, so the bare look-ups that
# used to sit between these statements never displayed anything.

# Locate missing data: `empty` is a boolean frame shaped like `data`, True wherever
# the corresponding cell is null.
empty = data.isnull()
empty.head(10)

# Recorded preview of the mask:
#       ID  Device  Installation  Pathway  Application  Version
# 0   True    True          True     True         True     True
# 1  False   False         False     True        False    False
# 2   True    True          True     True         True     True

# The blank rows that are baked into the CSV show up above as rows that are True
# (null) in every column. A null 'Application' is used as the marker for those rows,
# so dropping on that one column removes them.

data = data.dropna(subset=['Application'])

# Make a list of applications, then reduce it to the sorted unique values.
app_list = data['Application'].values
unique_app_list = numpy.unique(app_list)
# There are 500+ distinct names here, so bin them by vendor using substring matches.

# Ordered (vendor label, keywords) rules. A row receives the label of the FIRST rule
# with any keyword occurring in its application name; rows matching nothing are "Other".
# Keywords are literal, case-sensitive substrings.
#   * "Win" already matches every name containing "Windows", so the separate "Windows"
#     check from the earlier nested-where version could never fire and is omitted.
#   * NOTE(review): "Adobe" is filed under "Oracle" to preserve the existing binning;
#     confirm that vendor attribution is intended.
#   * NOTE(review): "Win" is a broad substring and will also catch any other name that
#     happens to contain it; confirm that is acceptable.
#   * The label is spelled "CheckPoint" (no space) while the keyword is "Check Point".
_VENDOR_KEYWORDS = [
    ("Microsoft", ["Microsoft", "Win", "Visual Studio", "vs_", "VS", "Office", "SQL", "sql"]),
    ("Oracle", ["Adobe", "Java"]),
    ("Apple", ["Apple"]),
    ("Autodesk", ["Autodesk"]),
    ("CheckPoint", ["Check Point"]),
    ("Dell", ["Dell"]),
    ("Intel", ["Intel"]),
]


def _classify_vendor(applications, rules=_VENDOR_KEYWORDS, default="Other"):
    """Return a numpy array holding one vendor label per application name.

    Parameters
    ----------
    applications : pandas.Series
        Application names to classify.
    rules : list of (str, list of str)
        Ordered (label, keywords) pairs; earlier rules take priority.
    default : str
        Label used when no keyword of any rule matches.

    Returns
    -------
    numpy.ndarray
        String labels aligned positionally with ``applications``.
    """
    labels = []
    masks = []
    for label, keywords in rules:
        hit = numpy.zeros(len(applications), dtype=bool)
        for keyword in keywords:
            # regex=False: keywords are plain text, not patterns.
            # na=False: a missing name matches nothing instead of propagating NaN.
            found = applications.str.contains(keyword, regex=False, na=False)
            hit |= numpy.asarray(found, dtype=bool)
        labels.append(label)
        masks.append(hit)
    if not masks:
        # numpy.select rejects an empty condition list; with no rules everything is default.
        return numpy.full(len(applications), default)
    # numpy.select takes the choice of the first True condition per element, which
    # reproduces the priority order of a nested numpy.where chain.
    return numpy.select(masks, labels, default=default)


data['Software'] = _classify_vendor(data['Application'])

# Tally rows per vendor bin: a (sorted labels, counts) pair.
software_list = data['Software'].values
unique_software_list = numpy.unique(software_list, return_counts=True)  # Looks reasonable

# Save dataframe to CSV
# data.to_csv(r'/path/filename.csv')

#### Creating a pie chart...

In [10]:
# data.head(10)

In [9]:
# Count occurrences of each Software type
# The snippet is disabled by wrapping it in a triple-quoted string, so nothing is
# imported or counted when this cell runs. Because the string is the cell's final
# expression, the notebook echoes its repr as the cell output.
'''
from collections import Counter
Counter(data.Software)
'''

'\nfrom collections import Counter\nCounter(data.Software)\n'

In [11]:
# Create a pie chart
# The plotting code is disabled by wrapping it in a triple-quoted string; the cell
# only evaluates (and echoes) that string, so no figure is produced.
# NOTE(review): `values` are hardcoded counts rather than being computed from `data`,
# so they will not follow changes to the input file or to the vendor binning.
# NOTE(review): the label here is 'Check Point' (with a space), whereas the Software
# column built in the data-cleaning cell uses "CheckPoint".
# NOTE(review): 'rgb(175, 51, 21)' appears twice in `colors` (5th and 7th entries);
# confirm two slices are meant to share a colour.
'''
import plotly.graph_objects as go
colors = ['rgb(56, 75, 126)','rgb(18, 36, 37)','rgb(34, 53, 101)',
          'rgb(36, 55, 57)','rgb(175, 51, 21)','rgb(206, 206, 40)',
          'rgb(175, 51, 21)','rgb(35, 36, 21)']
values = [1232,193,51,44,43,17,3,521]
labels=['Microsoft','Intel','Autodesk','Oracle','Dell','Check Point','Apple','Other']

fig = go.Figure(data=[go.Pie(labels=labels,values=values)])
fig.update_traces(hoverinfo='label+percent', textinfo='value', textfont_size=20,
                  marker=dict(colors=colors, line=dict(color='#000000', width=2)))
fig.update_layout(title_text='Counts of software installations across all machines')
fig.show()
'''

"\nimport plotly.graph_objects as go\ncolors = ['rgb(56, 75, 126)','rgb(18, 36, 37)','rgb(34, 53, 101)',\n          'rgb(36, 55, 57)','rgb(175, 51, 21)','rgb(206, 206, 40)',\n          'rgb(175, 51, 21)','rgb(35, 36, 21)']\nvalues = [1232,193,51,44,43,17,3,521]\nlabels=['Microsoft','Intel','Autodesk','Oracle','Dell','Check Point','Apple','Other']\n\nfig = go.Figure(data=[go.Pie(labels=labels,values=values)])\nfig.update_traces(hoverinfo='label+percent', textinfo='value', textfont_size=20,\n                  marker=dict(colors=colors, line=dict(color='#000000', width=2)))\nfig.update_layout(title_text='Counts of software installations across all machines')\nfig.show()\n"

In [6]:
# Count the number of devices
# Counter(data.Device)

In [5]:
# These lines create an interactive stacked bar chart. Commented out for upload ease.
# "Commented out" here means wrapped in a triple-quoted string: the cell only
# evaluates (and echoes) that string, so plotly is not imported and no figure is drawn.
# NOTE(review): both x and y are set to 'Software'; confirm that is intended before
# re-enabling this chart.
'''
import plotly.express as px

fig = px.bar(data, x='Software', y='Software', color='Device', orientation='h',
             hover_data=["Software", "Version"],
             height=800,
             title='Software installations per device (hover for versions)')
fig.show()
'''


'\nimport plotly.express as px\n\nfig = px.bar(data, x=\'Software\', y=\'Software\', color=\'Device\', orientation=\'h\',\n             hover_data=["Software", "Version"],\n             height=800,\n             title=\'Software installations per device (hover for versions)\')\nfig.show()\n'