In [1]:
import json
import scipy as sp
import numpy as np
import pandas as pd
import sys
import string
#from pgmpy.inference import VariableElimination
#from pgmpy.models import BayesianModel
from libpgm.nodedata import NodeData
from libpgm.graphskeleton import GraphSkeleton
from libpgm.discretebayesiannetwork import DiscreteBayesianNetwork
from libpgm.lgbayesiannetwork import LGBayesianNetwork
from libpgm.hybayesiannetwork import HyBayesianNetwork
from libpgm.dyndiscbayesiannetwork import DynDiscBayesianNetwork
from libpgm.tablecpdfactorization import TableCPDFactorization
from libpgm.sampleaggregator import SampleAggregator
from libpgm.pgmlearner import PGMLearner

In [2]:
df = pd.read_csv("../data/data.csv")

In [3]:
df.head()

Unnamed: 0,user_id,movie_title,rating,genre,release_date,age,gender,occupation
0,100,Air Force One (1997),4,multiple,01-01-1997,36,M,executive
1,100,Amistad (1997),4,Drama,18-12-1997,36,M,executive
2,100,Anna Karenina (1997),3,multiple,04-04-1997,36,M,executive
3,100,"Apostle, The (1997)",4,Drama,18-12-1997,36,M,executive
4,100,Apt Pupil (1998),5,multiple,23-10-1998,36,M,executive


In [4]:
df.describe()

Unnamed: 0,user_id,rating,age
count,99739.0,99739.0,99739.0
mean,462.530284,3.529913,32.971907
std,266.618279,1.125528,11.560293
min,1.0,1.0,7.0
25%,254.0,3.0,24.0
50%,447.0,4.0,30.0
75%,682.0,4.0,40.0
max,943.0,5.0,73.0


In [5]:
df.corr()

Unnamed: 0,user_id,rating,age
user_id,1.0,-0.009166,-0.073599
rating,-0.009166,1.0,0.05457
age,-0.073599,0.05457,1.0


In [6]:
df.shape

(99739, 8)

In [7]:
df['rating'].value_counts()

4    34079
3    27084
5    21146
2    11342
1     6088
Name: rating, dtype: int64

In [12]:
text = open("../data/unifiedMLData2.json")
data=text.read()

In [13]:
data[0:100]

'[\n  {\n    "user_id":100,\n    "movie_title":"Air Force One (1997)",\n    "rating":4,\n    "genre":"mult'

In [14]:
#data=data[0:1000]

In [15]:
listofDicts = json.loads(data)
print(listofDicts[0:5])

[{'user_id': 100, 'movie_title': 'Air Force One (1997)', 'rating': 4, 'genre': 'multiple', 'release_date': '1997-01-01', 'age': 36, 'gender': 'M', 'occupation': 'executive'}, {'user_id': 100, 'movie_title': 'Amistad (1997)', 'rating': 4, 'genre': 'Drama', 'release_date': '1997-12-18', 'age': 36, 'gender': 'M', 'occupation': 'executive'}, {'user_id': 100, 'movie_title': 'Anna Karenina (1997)', 'rating': 3, 'genre': 'multiple', 'release_date': '1997-04-04', 'age': 36, 'gender': 'M', 'occupation': 'executive'}, {'user_id': 100, 'movie_title': 'Apostle, The (1997)', 'rating': 4, 'genre': 'Drama', 'release_date': '1997-12-18', 'age': 36, 'gender': 'M', 'occupation': 'executive'}, {'user_id': 100, 'movie_title': 'Apt Pupil (1998)', 'rating': 5, 'genre': 'multiple', 'release_date': '1998-10-23', 'age': 36, 'gender': 'M', 'occupation': 'executive'}]


In [16]:
skel = GraphSkeleton()
skel.load("../data/skeleton.json")
#"V": ["user_id","movie_title", "rating", "genre", "release_date", "age", "gender", "occupation"],
#"E": [["occupation", "rating"],["gender","rating"],["age","rating"],["age","occupation"],["gender","occupation"],["genre","movie_title"],["movie_title","rating"],['user_id','rating']]

**GraphSkeleton** - This class represents a graph skeleton, meaning a vertex set and a directed edge set. It contains the attributes V and E, and the methods load, getparents, getchildren, and toporder.

In [17]:
print(skel.toporder())

None


In [18]:
print(skel.getparents('rating'))

['occupation', 'gender', 'age', 'movie_title', 'user_id']


In [19]:
print(skel)

<libpgm.graphskeleton.GraphSkeleton object at 0x000001E6CFF45B70>


In [20]:
occupations = ['administrator',
#'artist',
#'scientist',
'student',
'doctor',
'educator',
'engineer',
'entertainment',
'executive',
'healthcare',
'homemaker',
'lawyer',
'librarian',
'marketing',
'none',
'other',
'programmer',
'retired',
'salesman',
'technician',
'writer']

In [21]:
occupations[1]

'student'

# Creating a Bayesian Network

In [22]:
learner = PGMLearner()

**PGMLearner** - This module provides tools to generate Bayesian networks that are “learned” from a data set. The learning process involves finding the Bayesian network that most accurately models data given as input – in other words, finding the Bayesian network that makes the data set most likely.

There are two major parts of Bayesian network learning: **structure learning and parameter learning**. 

Structure learning means finding the graph that most accurately depicts the dependencies detected in the data. Parameter learning means adjusting the parameters of the CPDs in a graph skeleton to most accurately model the data. This module has tools for both of these tasks.

# Maximum Likelihood Estimation - To learn the parameters of the CPDs

We instantiated the PGMLearner class. The method discrete_mle_estimateparams already knows the structure of the network. The estimates for each CPD only needs information from the parent, and this decomposition makes it possible to learn the parameters of each CPD.

In [19]:
result = learner.discrete_mle_estimateparams(skel, listofDicts)

**Estimate parameters for a discrete Bayesian network with a structure given by graphskeleton in order to maximize the probability of data given by data** This function normalizes the distribution of a node’s outcomes for each combination of its parents’ outcomes. In doing so it creates an estimated tabular conditional probability distribution for each node. It then instantiates a DiscreteBayesianNetwork instance based on the graphskeleton, and modifies that instance’s Vdata attribute to reflect the estimated CPDs. It then returns the instance.

In [20]:
# output- same as if you would have provided by yourself in NodeData
print(json.dumps(result.Vdata,indent=2)[0:200])


KeyboardInterrupt



In [None]:
pd.DataFrame(result.Vdata['rating']['cprob']).transpose().head()

In [24]:
myquery = dict(movie_title = ['Air Force One (1997)'])
myevidence = dict(user_id=11)
result.specificquery(query=myquery,evidence = myevidence)

0.004871322671166215

# Gibbs Sampling

**Return a sequence of n samples using the Gibbs sampling method, given evidence specified by evidence.** Gibbs sampling is a technique wherein for each sample, each variable in turn is erased and calculated conditioned on the outcomes of its neighbors. This method starts by sampling from the ‘prior distribution,’ which is the distribution not conditioned on evidence, but the samples provably get closer and closer to the posterior distribution, which is the distribution conditioned on the evidence. It is thus a good way to deal with evidence when generating random samples.

In [23]:
text = open("../data/unifiedMLData2.json")
data = text.read()
listofDicts=json.loads(data)

skel = GraphSkeleton()
skel.load("../data/skeleton.json")

learner = PGMLearner()

result = learner.discrete_mle_estimateparams(skel, listofDicts)

tcf = TableCPDFactorization(result)

myquery = dict(movie_title = ['Air Force One (1997)'])
myevidence = dict(user_id=11)
res2 = tcf.gibbssample(evidence = myevidence,n = 5)


KeyboardInterrupt



In [19]:
print(json.dumps(res2, indent=2))

[
  {
    "user_id": 10,
    "release_date": "1997-01-01",
    "genre": "multiple",
    "movie_title": "Tales from the Hood (1995)",
    "gender": "M",
    "age": 19,
    "occupation": "student",
    "rating": 5
  },
  {
    "user_id": 109,
    "release_date": "1968-01-01",
    "genre": "multiple",
    "movie_title": "African Queen, The (1951)",
    "gender": "M",
    "age": 20,
    "occupation": "student",
    "rating": 5
  },
  {
    "user_id": 116,
    "release_date": "1991-01-01",
    "genre": "multiple",
    "movie_title": "Don Juan DeMarco (1995)",
    "gender": "M",
    "age": 27,
    "occupation": "student",
    "rating": 4
  },
  {
    "user_id": 110,
    "release_date": "1996-03-08",
    "genre": "multiple",
    "movie_title": "Professional, The (1994)",
    "gender": "M",
    "age": 20,
    "occupation": "student",
    "rating": 5
  },
  {
    "user_id": 10,
    "release_date": "1996-04-26",
    "genre": "multiple",
    "movie_title": "English Patient, The (1996)",
    "gend

# Movielens test

In [238]:
#values = pd.DataFrame(np.random.randint(low=0, high=2, size=(1000, 5)),columns=['A', 'B', 'C', 'D', 'E'])

values1 = pd.read_csv('../data/unifiedMLData.csv')
values = pd.DataFrame(values1,columns=['movie_title', 'genre', 'age','occupation','rating','gender'])
s = values.reset_index().to_json(orient='records')

#a = values.to_json()
#print a
model = BayesianModel([('genre','movie_title'), ('age', 'occupation'),('occupation','rating'),('movie_title','rating')])
model.fit(values)

print(model.get_cpds('genre'))
inference = VariableElimination(model)
phi_query = inference.query(['age', 'occupation'])
print(phi_query['occupation'])

╒════════════════════╤═════════════╕
│ genre(Action)      │ 0.008813    │
├────────────────────┼─────────────┤
│ genre(Adventure)   │ 0.00324848  │
├────────────────────┼─────────────┤
│ genre(Animation)   │ 0.000721884 │
├────────────────────┼─────────────┤
│ genre(Childrens)   │ 0.000180471 │
├────────────────────┼─────────────┤
│ genre(Comedy)      │ 0.0985272   │
├────────────────────┼─────────────┤
│ genre(Crime)       │ 0.00324848  │
├────────────────────┼─────────────┤
│ genre(Documentary) │ 0.00648693  │
├────────────────────┼─────────────┤
│ genre(Drama)       │ 0.132355    │
├────────────────────┼─────────────┤
│ genre(Fantasy)     │ 1.00262e-05 │
├────────────────────┼─────────────┤
│ genre(Film-Noir)   │ 0.000671753 │
├────────────────────┼─────────────┤
│ genre(Horror)      │ 0.0156208   │
├────────────────────┼─────────────┤
│ genre(Musical)     │ 0.00319835  │
├────────────────────┼─────────────┤
│ genre(Mystery)     │ 0.00201526  │
├────────────────────┼─────────────┤
│

  phi.values = phi.values[slice_]
  phi1.values = phi1.values[slice_]


In [31]:
myevidence = dict(gender='F')
res2 = []

for occu in occupations:
    myquery = dict(occupation=[occu])
    res2 = tcf.condprobve(query = myquery, evidence = myevidence)
    #res2 = tcf.specificquery(query=myquery,evidence=myevidence)
    res2 = json.dumps(res2.vals)
    print(res2)

[0.07240719479525455, 0.07240719479525455, 0.07240719479525455, 0.07240719479525455, 0.07240719479525455, 0.07240719479525455, 0.07240719479525455, 0.07240719479525455, 0.13111366245694547, 0.07240719479525455, 0.07240719479525455, 0.07240719479525455, 0.07240719479525455]


**condprobve** - Eliminate all variables in factorlist except for the ones queried. Adjust all distributions for the evidence given. Return the probability distribution over a set of variables given by the keys of query given evidence.

**Arguments:**

query – A dict containing (key: value) pairs reflecting (variable: value) that represents what outcome to calculate the probability of.

evidence – A dict containing (key: value) pairs reflecting (variable: value) that represents what is known about the system.

**Attributes modified:**
factorlist – Modified to be one factor representing the probability distribution of the query variables given the evidence.

In [32]:
res2=res2.strip("[]").split(',')

In [33]:
y=[float(i) for i in res2]

In [41]:
mle = y[0]

In [42]:
mle

0.07240719479525455

In [40]:
for i in range(1,len(y)-1):
    mle = max(y[i-1],y[i])
print("mle of occupation given gender is Female")
print(mle)

mle of occupation given gender is Female
0.07240719479525455
